aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--youtube_dl/extractor/__init__.py2
-rw-r--r--youtube_dl/extractor/canalplus.py46
-rw-r--r--youtube_dl/extractor/common.py29
-rw-r--r--youtube_dl/extractor/criterion.py14
-rw-r--r--youtube_dl/extractor/cspan.py4
-rw-r--r--youtube_dl/extractor/dailymotion.py5
-rw-r--r--youtube_dl/extractor/ehow.py11
-rw-r--r--youtube_dl/extractor/escapist.py8
-rw-r--r--youtube_dl/extractor/flickr.py15
-rw-r--r--youtube_dl/extractor/funnyordie.py5
-rw-r--r--youtube_dl/extractor/gametrailers.py65
-rw-r--r--youtube_dl/extractor/hotnewhiphop.py8
-rw-r--r--youtube_dl/extractor/instagram.py17
-rw-r--r--youtube_dl/extractor/keek.py3
-rw-r--r--youtube_dl/extractor/liveleak.py6
-rw-r--r--youtube_dl/extractor/livestream.py52
-rw-r--r--youtube_dl/extractor/mtv.py151
-rw-r--r--youtube_dl/extractor/nba.py3
-rw-r--r--youtube_dl/extractor/statigram.py10
-rw-r--r--youtube_dl/extractor/teamcoco.py15
-rw-r--r--youtube_dl/extractor/traileraddict.py9
-rw-r--r--youtube_dl/extractor/tutv.py4
-rw-r--r--youtube_dl/extractor/vine.py10
-rw-r--r--youtube_dl/extractor/youjizz.py16
24 files changed, 299 insertions, 209 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7b177e343..7a2a09ab0 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -6,6 +6,7 @@ from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .canalplus import CanalplusIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .criterion import CriterionIE
@@ -37,6 +38,7 @@ from .jukebox import JukeboxIE
from .justintv import JustinTVIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
+from .livestream import LivestreamIE
from .metacafe import MetacafeIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
new file mode 100644
index 000000000..3b1c88876
--- /dev/null
+++ b/youtube_dl/extractor/canalplus.py
@@ -0,0 +1,46 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+class CanalplusIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+ _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
+ IE_NAME = u'canalplus.fr'
+
+ _TEST = {
+ u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
+ u'file': u'889861.flv',
+ u'md5': u'590a888158b5f0d6832f84001fbf3e99',
+ u'info_dict': {
+ u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
+ u'upload_date': u'20130620',
+ },
+ u'skip': u'Requires rtmpdump'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ info_url = self._VIDEO_INFO_TEMPLATE % video_id
+ info_page = self._download_webpage(info_url,video_id,
+ u'Downloading video info')
+
+ self.report_extraction(video_id)
+ doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
+ video_info = [video for video in doc if video.find('ID').text == video_id][0]
+ infos = video_info.find('INFOS')
+ media = video_info.find('MEDIA')
+ formats = [media.find('VIDEOS/%s' % format)
+ for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
+ video_url = [format.text for format in formats if format is not None][-1]
+
+ return {'id': video_id,
+ 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
+ infos.find('TITRAGE/SOUS_TITRE').text),
+ 'url': video_url,
+ 'ext': 'flv',
+ 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+ 'thumbnail': media.find('IMAGES/GRAND').text,
+ }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 1bd5538ca..ec988fc90 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -125,6 +125,11 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
+
+ # Strip hashes from the URL (#1038)
+ if isinstance(url_or_request, (compat_str, str)):
+ url_or_request = url_or_request.partition('#')[0]
+
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -257,6 +262,30 @@ class InfoExtractor(object):
return (username, password)
+ # Helper functions for extracting OpenGraph info
+ @staticmethod
+ def _og_regex(prop):
+ return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+
+ def _og_search_property(self, prop, html, name=None, **kargs):
+ if name is None:
+ name = 'OpenGraph %s' % prop
+ return self._html_search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+
+ def _og_search_thumbnail(self, html, **kargs):
+ return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+
+ def _og_search_description(self, html, **kargs):
+ return self._og_search_property('description', html, fatal=False, **kargs)
+
+ def _og_search_title(self, html, **kargs):
+ return self._og_search_property('title', html, **kargs)
+
+ def _og_search_video_url(self, html, name='video url', **kargs):
+ return self._html_search_regex([self._og_regex('video:secure_url'),
+ self._og_regex('video')],
+ html, name, **kargs)
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py
index a149d2900..31fe3d57b 100644
--- a/youtube_dl/extractor/criterion.py
+++ b/youtube_dl/extractor/criterion.py
@@ -3,38 +3,38 @@
import re
from .common import InfoExtractor
+from ..utils import determine_ext
class CriterionIE(InfoExtractor):
- _VALID_URL = r'http://www.criterion.com/films/(.*)'
+ _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+'
_TEST = {
u'url': u'http://www.criterion.com/films/184-le-samourai',
u'file': u'184.mp4',
u'md5': u'bc51beba55685509883a9a7830919ec3',
u'info_dict': {
u"title": u"Le Samouraï",
- u"description" : u"In a career-defining performance, Alain Delon plays a contract killer with samurai instincts. A razor-sharp cocktail of 1940s American gangster cinema and 1960s French pop culture, maverick director Jean-Pierre Melville&#x27;s masterpiece _Le Samouraï_ defines cool. "
+ u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1).split('-')[0]
+ video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;',
webpage, 'video url')
- title = self._search_regex(r'<meta content="(.+?)" property="og:title" />',
+ title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />',
webpage, 'video title')
- description = self._search_regex(r'<meta name="description" content="(.+?)" />',
+ description = self._html_search_regex(r'<meta name="description" content="(.+?)" />',
webpage, 'video description')
thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
webpage, 'thumbnail url')
- ext = final_url.split('.')[-1]
return {'id': video_id,
'url' : final_url,
'title': title,
- 'ext': ext,
+ 'ext': determine_ext(final_url),
'description': description,
'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index a4853279b..7bf03c584 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor):
description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
webpage, 'description',
flags=re.MULTILINE|re.DOTALL)
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
- webpage, 'thumbnail')
url = self._search_regex(r'<string name="URL">(.*?)</string>',
video_info, 'video url')
@@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor):
'url': url,
'play_path': path,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 5fd2221a7..9bf7a28ca 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor):
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
- webpage, 'title')
-
video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
# Looking for official user
r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,7 +73,7 @@ class DailymotionIE(InfoExtractor):
'url': video_url,
'uploader': video_uploader,
'upload_date': video_upload_date,
- 'title': video_title,
+ 'title': self._og_search_title(webpage),
'ext': video_extension,
'thumbnail': info['thumbnail_url']
}]
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index 1f0b3888e..2bb77aec6 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -28,14 +28,9 @@ class EHowIE(InfoExtractor):
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
webpage, u'video URL')
final_url = compat_urllib_parse.unquote(video_url)
- thumbnail_url = self._search_regex(r'<meta property="og:image" content="(.+?)" />',
- webpage, u'thumbnail URL')
uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
webpage, u'uploader')
- title = self._search_regex(r'<meta property="og:title" content="(.+?)" />',
- webpage, u'Video title').replace(' | eHow', '')
- description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
- webpage, u'video description')
+ title = self._og_search_title(webpage).replace(' | eHow', '')
ext = determine_ext(final_url)
return {
@@ -44,8 +39,8 @@ class EHowIE(InfoExtractor):
'url': final_url,
'ext': ext,
'title': title,
- 'thumbnail': thumbnail_url,
- 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
'uploader': uploader,
}
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 794460e84..3aa2da52c 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor):
videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
- webpage, u'thumbnail', fatal=False)
-
- playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
- webpage, u'player url')
+ playerUrl = self._og_search_video_url(webpage, name='player url')
title = self._html_search_regex('<meta name="title" content="([^"]*)"',
webpage, u'player url').split(' : ')[-1]
@@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor):
'upload_date': None,
'title': title,
'ext': 'mp4',
- 'thumbnail': imgUrl,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
}
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index bd97bff9a..80d96baf7 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor):
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'video title')
-
- video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'description', fatal=False)
-
- thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'thumbnail', fatal=False)
-
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'description': video_description,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id': video_uploader_id,
}]
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 388aacf2f..67a7e5f76 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor):
title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
- video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
- webpage, u'description', fatal=False, flags=re.DOTALL)
-
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'description': video_description,
+ 'description': self._og_search_description(webpage),
}
return [info]
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
index cd438bd2f..3cc02d97e 100644
--- a/youtube_dl/extractor/gametrailers.py
+++ b/youtube_dl/extractor/gametrailers.py
@@ -1,63 +1,36 @@
import re
-import xml.etree.ElementTree
-from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse,
+from .mtv import MTVIE, _media_xml_tag
- ExtractorError,
-)
-
-class GametrailersIE(InfoExtractor):
+class GametrailersIE(MTVIE):
+ """
+ Gametrailers use the same videos system as MTVIE, it just changes the feed
+ url, where the uri is and the method to get the thumbnails.
+ """
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
- u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.flv',
- u'md5': u'c3edbc995ab4081976e16779bd96a878',
+ u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
+ u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
u'info_dict': {
- u"title": u"E3 2013: Debut Trailer"
+ u'title': u'E3 2013: Debut Trailer',
+ u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
- u'skip': u'Requires rtmpdump'
}
+ # Overwrite MTVIE properties we don't want
+ _TESTS = []
+
+ _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
+
+ def _get_thumbnail_url(self, uri, itemdoc):
+ search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+ return itemdoc.find(search_path).attrib['url']
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
webpage, u'mgid')
-
- data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
- info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
- video_id, u'Downloading video info')
- doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
- default_thumb = doc.find('./channel/image/url').text
-
- media_namespace = {'media': 'http://search.yahoo.com/mrss/'}
- parts = [{
- 'title': video_doc.find('title').text,
- 'ext': 'flv',
- 'id': video_doc.find('guid').text.rpartition(':')[2],
- # Videos are actually flv not mp4
- 'url': self._get_video_url(video_doc.find('media:group/media:content', media_namespace).attrib['url'], video_id),
- # The thumbnail may not be defined, it would be ''
- 'thumbnail': video_doc.find('media:group/media:thumbnail', media_namespace).attrib['url'] or default_thumb,
- 'description': video_doc.find('description').text,
- } for video_doc in doc.findall('./channel/item')]
- return parts
-
- def _get_video_url(self, mediagen_url, video_id):
- if 'acceptMethods' not in mediagen_url:
- mediagen_url += '&acceptMethods=fms'
- links_webpage = self._download_webpage(mediagen_url,
- video_id, u'Downloading video urls info')
- doc = xml.etree.ElementTree.fromstring(links_webpage)
- urls = list(doc.iter('src'))
- if len(urls) == 0:
- raise ExtractorError(u'Unable to extract video url')
- # They are sorted from worst to best quality
- return urls[-1].text
-
+ return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index ca3abb7d7..ccca1d7e0 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):
video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
-
- # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(r'"og:image" content="(.*)"',
- webpage_src, u'thumbnail', fatal=False)
results = [{
'id': video_id,
'url' : video_url,
'title' : video_title,
- 'thumbnail' : thumbnail,
+ 'thumbnail' : self._og_search_thumbnail(webpage_src),
'ext' : 'mp3',
}]
- return results \ No newline at end of file
+ return results
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 6ae704efd..f9ac8d5b4 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
_TEST = {
- u'url': u'http://instagram.com/p/aye83DjauH/#',
+ u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
u'md5': u'0d2da106a9d2631273e192b372806516',
u'info_dict': {
@@ -18,25 +18,20 @@ class InstagramIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'<meta property="og:video" content="(.+?)"',
- webpage, u'video URL')
- thumbnail_url = self._html_search_regex(
- r'<meta property="og:image" content="(.+?)" />',
- webpage, u'thumbnail URL', fatal=False)
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title', flags=re.DOTALL)
title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip()
- uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram',
- webpage, u'uploader name', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'<div class="media-user" id="media_user">.*?<h2><a href="[^"]*">([^<]*)</a></h2>',
+ webpage, u'uploader id', fatal=False, flags=re.DOTALL)
ext = 'mp4'
return [{
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': ext,
'title': title,
- 'thumbnail': thumbnail_url,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id
}]
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index 72ad6a3d0..dda78743d 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -24,8 +24,7 @@ class KeekIE(InfoExtractor):
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title')
+ video_title = self._og_search_title(webpage)
uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index cf8a2c931..dd062a14e 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor):
video_url = self._search_regex(r'file: "(.*?)",',
webpage, u'video URL')
- video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title').replace('LiveLeak.com -', '').strip()
+ video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
- video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
- webpage, u'description', fatal=False)
+ video_description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
new file mode 100644
index 000000000..309921078
--- /dev/null
+++ b/youtube_dl/extractor/livestream.py
@@ -0,0 +1,52 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse_urlparse, compat_urlparse
+
+
+class LivestreamIE(InfoExtractor):
+ _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
+ _TEST = {
+ u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
+ u'file': u'4719370.mp4',
+ u'md5': u'0d2186e3187d185a04b3cdd02b828836',
+ u'info_dict': {
+ u'title': u'Live from Webster Hall NYC',
+ u'upload_date': u'20121012',
+ }
+ }
+
+ def _extract_video_info(self, video_data):
+ video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url')
+ return {'id': video_data['id'],
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': video_data['caption'],
+ 'thumbnail': video_data['thumbnail_url'],
+ 'upload_date': video_data['updated_at'].replace('-','')[:8],
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ event_name = mobj.group('event_name')
+ webpage = self._download_webpage(url, video_id or event_name)
+
+ if video_id is None:
+ # This is an event page:
+ api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'',
+ webpage, 'api url')
+ info = json.loads(self._download_webpage(api_url, event_name,
+ u'Downloading event info'))
+ videos = [self._extract_video_info(video_data['data'])
+ for video_data in info['feed']['data'] if video_data['type'] == u'video']
+ return self.playlist_result(videos, info['id'], info['full_name'])
+ else:
+ og_video = self._og_search_video_url(webpage, name=u'player url')
+ query_str = compat_urllib_parse_urlparse(og_video).query
+ query = compat_urlparse.parse_qs(query_str)
+ api_url = query['play_url'][0].replace('.smil', '')
+ info = json.loads(self._download_webpage(api_url, video_id,
+ u'Downloading video info'))
+ return self._extract_video_info(info)
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 969db7113..8f956571d 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -1,28 +1,110 @@
import re
-import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_request,
-
+ compat_urllib_parse,
ExtractorError,
)
+def _media_xml_tag(tag):
+ return '{http://search.yahoo.com/mrss/}%s' % tag
class MTVIE(InfoExtractor):
- _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
- _WORKING = False
+ _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+
+ _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+ _TESTS = [
+ {
+ u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+ u'file': u'853555.mp4',
+ u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+ u'info_dict': {
+ u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+ u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+ },
+ },
+ {
+ u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+ u'file': u'USCJY1331283.mp4',
+ u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+ u'info_dict': {
+ u'title': u'Everything Has Changed',
+ u'upload_date': u'20130606',
+ u'uploader': u'Taylor Swift',
+ },
+ u'skip': u'VEVO is only available in some countries',
+ },
+ ]
+
+ @staticmethod
+ def _id_from_uri(uri):
+ return uri.split(':')[-1]
+
+ # This was originally implemented for ComedyCentral, but it also works here
+ @staticmethod
+ def _transform_rtmp_url(rtmp_video_url):
+ m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
+ if not m:
+ raise ExtractorError(u'Cannot transform RTMP url')
+ base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+ return base + m.group('finalid')
+
+ def _get_thumbnail_url(self, uri, itemdoc):
+ return 'http://mtv.mtvnimages.com/uri/' + uri
+
+ def _extract_video_url(self, metadataXml):
+ if '/error_country_block.swf' in metadataXml:
+ raise ExtractorError(u'This video is not available from your country.', expected=True)
+ mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
+ renditions = mdoc.findall('.//rendition')
+
+ # For now, always pick the highest quality.
+ rendition = renditions[-1]
+
+ try:
+ _,_,ext = rendition.attrib['type'].partition('/')
+ format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
+ rtmp_video_url = rendition.find('./src').text
+ except KeyError:
+ raise ExtractorError('Invalid rendition field.')
+ video_url = self._transform_rtmp_url(rtmp_video_url)
+ return {'ext': ext, 'url': video_url, 'format': format}
+
+ def _get_video_info(self, itemdoc):
+ uri = itemdoc.find('guid').text
+ video_id = self._id_from_uri(uri)
+ self.report_extraction(video_id)
+ mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+ if 'acceptMethods' not in mediagen_url:
+ mediagen_url += '&acceptMethods=fms'
+ mediagen_page = self._download_webpage(mediagen_url, video_id,
+ u'Downloading video urls')
+ video_info = self._extract_video_url(mediagen_page)
+
+ description_node = itemdoc.find('description')
+ if description_node is not None:
+ description = description_node.text
+ else:
+ description = None
+ video_info.update({'title': itemdoc.find('title').text,
+ 'id': video_id,
+ 'thumbnail': self._get_thumbnail_url(uri, itemdoc),
+ 'description': description,
+ })
+ return video_info
+
+ def _get_videos_info(self, uri):
+ video_id = self._id_from_uri(uri)
+ data = compat_urllib_parse.urlencode({'uri': uri})
+ infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
+ u'Downloading info')
+ idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
+ return [self._get_video_info(item) for item in idoc.findall('.//item')]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- if not mobj.group('proto'):
- url = 'http://' + url
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
@@ -35,46 +117,5 @@ class MTVIE(InfoExtractor):
self.to_screen(u'Vevo video detected: %s' % vevo_id)
return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
- #song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
- # webpage, u'song name', fatal=False)
-
- video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
- webpage, u'title')
-
- mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
- webpage, u'mtvn_uri', fatal=False)
-
- content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
- webpage, u'content id', fatal=False)
-
- videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
- self.report_extraction(video_id)
- request = compat_urllib_request.Request(videogen_url)
- try:
- metadataXml = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
-
- mdoc = xml.etree.ElementTree.fromstring(metadataXml)
- renditions = mdoc.findall('.//rendition')
-
- # For now, always pick the highest quality.
- rendition = renditions[-1]
-
- try:
- _,_,ext = rendition.attrib['type'].partition('/')
- format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
- video_url = rendition.find('./src').text
- except KeyError:
- raise ExtractorError('Invalid rendition field.')
-
- info = {
- 'id': video_id,
- 'url': video_url,
- 'upload_date': None,
- 'title': video_title,
- 'ext': ext,
- 'format': format,
- }
-
- return [info]
+ uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+ return self._get_videos_info(uri)
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 122b7dd26..0f178905b 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -30,8 +30,7 @@ class NBAIE(InfoExtractor):
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
shortened_video_id = video_id.rpartition('/')[2]
- title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
- webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+ title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
# It isn't there in the HTML it returns to us
# uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py
index ae9a63e8b..b8e6b3bf9 100644
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/statigram.py
@@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'<meta property="og:video:secure_url" content="(.+?)">',
- webpage, u'video URL')
- thumbnail_url = self._html_search_regex(
- r'<meta property="og:image" content="(.+?)" />',
- webpage, u'thumbnail URL', fatal=False)
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title')
@@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor):
return [{
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': ext,
'title': title,
- 'thumbnail': thumbnail_url,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id
}]
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 1dd5e1b68..ec92e589a 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor):
self.report_extraction(video_id)
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
- webpage, u'title')
-
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
- webpage, u'thumbnail', fatal=False)
-
- video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
- webpage, u'description', fatal=False)
-
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
@@ -49,7 +40,7 @@ class TeamcocoIE(InfoExtractor):
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': thumbnail,
- 'description': video_description,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
}]
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py
index 9dd26c163..324bb6231 100644
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -24,11 +24,8 @@ class TrailerAddictIE(InfoExtractor):
webpage, 'video title').replace(' - Trailer Addict','')
view_count = self._search_regex(r'Views: (.+?)<br />',
webpage, 'Views Count')
- description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
- webpage, 'video description')
- video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />',
- webpage, 'Video id').split('=')[1]
-
+ video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
+
info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))
info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
@@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor):
'ext' : ext,
'title' : title,
'thumbnail' : thumbnail_url,
- 'description' : description,
+ 'description' : self._og_search_description(webpage),
'view_count' : view_count,
}]
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index fcaa6ac01..4e404fbf5 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -22,8 +22,6 @@ class TutvIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<meta property="og:title" content="(.*?)">', webpage, u'title')
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
@@ -36,6 +34,6 @@ class TutvIE(InfoExtractor):
'id': internal_id,
'url': video_url,
'ext': ext,
- 'title': title,
+ 'title': self._og_search_title(webpage),
}
return [info]
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index bdd3522eb..c4ec1f06f 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,12 +27,6 @@ class VineIE(InfoExtractor):
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
- webpage, u'title')
-
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
- webpage, u'thumbnail', fatal=False)
-
uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
@@ -40,7 +34,7 @@ class VineIE(InfoExtractor):
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader,
}]
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index 6f022670c..1265639e8 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(embed_page_url, video_id)
# Get the video URL
- video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
- webpage, u'video URL')
+ m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
+ if m_playlist is not None:
+ playlist_url = m_playlist.group('playlist')
+ playlist_page = self._download_webpage(playlist_url, video_id,
+ u'Downloading playlist page')
+ m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))
+ if len(m_levels) == 0:
+ raise ExtractorError(u'Unable to extract video url')
+ videos = [(int(m.group(1)), m.group(2)) for m in m_levels]
+ (_, video_url) = sorted(videos)[0]
+ video_url = video_url.replace('%252F', '%2F')
+ else:
+ video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+ webpage, u'video URL')
info = {'id': video_id,
'url': video_url,