From 9a1c32dc54fdefcd6b5e03fac1a0dd65383b6f99 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 14 Sep 2013 05:42:00 +0200 Subject: XHamsterIE: Add support for new URL format --- youtube_dl/extractor/xhamster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 88b8b6be0..e50069586 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,7 +11,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' + _VALID_URL = r'(?:http://)?(?P(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html(?:\?.*)?)' _TEST = { u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', u'file': u'1509445.flv', @@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://' + mobj.group('url') webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) -- cgit v1.2.3 From fad84d50fe124df1c620c9bc95bdc4c9e5053e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 11:10:01 +0200 Subject: [googleplus] Fix upload date extraction --- youtube_dl/extractor/googleplus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = 
self._html_search_regex('title="Timestamp">(.*?)', + upload_date = self._html_search_regex( + ['title="Timestamp">(.*?)', r'(.+?)'], webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename -- cgit v1.2.3 From 0b7f31184d6a2d87cf7f568c561ff8d017f07bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 11:14:40 +0200 Subject: Now --all-sub is a modifier to --write-sub and --write-auto-sub (closes #1412) For keeping backwards compatibility --all-sub sets --write-sub if --write-auto-sub is not given --- youtube_dl/extractor/subtitles.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 97215f289..90de7de3a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor): @property def _have_to_download_any_subtitles(self): return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub'), - self._downloader.params.get('allsubtitles', False)]) + self._downloader.params.get('writeautomaticsub')]) def _list_available_subtitles(self, video_id, webpage=None): """ outputs the available subtitles for the video """ @@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor): available_subs_list = {} if self._downloader.params.get('writeautomaticsub', False): available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) - if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + if self._downloader.params.get('writesubtitles', False): available_subs_list.update(self._get_available_subtitles(video_id)) if not available_subs_list: # error, it didn't get the available subtitles -- cgit v1.2.3 From 
19e1d35989970831007b7ca5d988fe0454f08a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 14:26:42 +0200 Subject: [mixcloud] Rewrite extractor (fixes #278) --- youtube_dl/extractor/mixcloud.py | 122 ++++++++++++--------------------------- 1 file changed, 38 insertions(+), 84 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket from .common import InfoExtractor from ..utils import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_request, - - ExtractorError, + unified_strdate, ) class MixcloudIE(InfoExtractor): - _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'mixcloud' - def report_download_json(self, file_id): - """Report JSON download.""" - self.to_screen(u'Downloading json') - - def get_urls(self, jsonData, fmt, bitrate='best'): - """Get urls from 'audio_formats' section in json""" - try: - bitrate_list = jsonData[fmt] - if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: - bitrate = max(bitrate_list) # select highest - - url_list = jsonData[fmt][bitrate] - except TypeError: # we have no bitrate info. 
- url_list = jsonData[fmt] - return url_list + _TEST = { + u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', + u'file': u'dholbach-cryptkeeper.mp3', + u'info_dict': { + u'title': u'Cryptkeeper', + u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', + u'uploader': u'Daniel Holbach', + u'uploader_id': u'dholbach', + u'upload_date': u'20111115', + }, + } def check_urls(self, url_list): """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor): return None - def _print_formats(self, formats): - print('Available formats:') - for fmt in formats.keys(): - for b in formats[fmt]: - try: - ext = formats[fmt][b][0] - print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) - except TypeError: # we have no bitrate info - ext = formats[fmt][0] - print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - # extract uploader & filename from url - uploader = mobj.group(1).decode('utf-8') - file_id = uploader + "-" + mobj.group(2).decode('utf-8') - - # construct API request - file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' - # retrieve .json file with links to files - request = compat_urllib_request.Request(file_url) - try: - self.report_download_json(file_url) - jsonData = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) - - # parse JSON - json_data = json.loads(jsonData) - player_url = json_data['player_swf_url'] - formats = dict(json_data['audio_formats']) - - req_format = self._downloader.params.get('format', None) - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) 
- return - - if req_format is None or req_format == 'best': - for format_param in formats.keys(): - url_list = self.get_urls(formats, format_param) - # check urls - file_url = self.check_urls(url_list) - if file_url is not None: - break # got it! - else: - if req_format not in formats: - raise ExtractorError(u'Format is not available') - - url_list = self.get_urls(formats, req_format) - file_url = self.check_urls(url_list) - format_param = req_format - return [{ - 'id': file_id.decode('utf-8'), - 'url': file_url.decode('utf-8'), - 'uploader': uploader.decode('utf-8'), - 'upload_date': None, - 'title': json_data['name'], - 'ext': file_url.split('.')[-1].decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': json_data['thumbnail_url'], - 'description': json_data['description'], - 'player_url': player_url.decode('utf-8'), - }] + uploader = mobj.group(1) + cloudcast_name = mobj.group(2) + track_id = '-'.join((uploader, cloudcast_name)) + api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) + webpage = self._download_webpage(url, track_id) + json_data = self._download_webpage(api_url, track_id, + u'Downloading cloudcast info') + info = json.loads(json_data) + + preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') + song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') + template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) + final_song_url = self.check_urls(template_url % i for i in range(30)) + + return { + 'id': track_id, + 'title': info['name'], + 'url': final_song_url, + 'ext': 'mp3', + 'description': info['description'], + 'thumbnail': info['pictures'].get('extra_large'), + 'uploader': info['user']['name'], + 'uploader_id': info['user']['username'], + 'upload_date': unified_strdate(info['created_time']), + 'view_count': info['play_count'], + } -- cgit v1.2.3 From 471a5ee908ee765c1ba1ff6a41051bcf71065064 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 14:45:04 +0200 Subject: Set the ext field for each format --- youtube_dl/extractor/archiveorg.py | 7 ++++--- youtube_dl/extractor/dreisat.py | 6 +++--- youtube_dl/extractor/trilulilu.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor): for fn,fdata in data['files'].items() if 'Video' in fdata['format']] formats.sort(key=lambda fdata: fdata['file_size']) + for f in formats: + f['ext'] = determine_ext(f['url']) info = { '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor): info['thumbnail'] = thumbnail # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = determine_ext(formats[-1]['url']) + info.update(formats[-1]) - return info \ No newline at end of file + return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 64b465805..765cb1f37 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor): 'width': int(fe.find('./width').text), 'height': int(fe.find('./height').text), 'url': fe.find('./url').text, + 'ext': determine_ext(fe.find('./url').text), 'filesize': int(fe.find('./filesize').text), 'video_bitrate': int(fe.find('./videoBitrate').text), '3sat_qualityname': fe.find('./quality').text, @@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor): } # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = determine_ext(formats[-1]['url']) + info.update(formats[-1]) - return info \ No newline at end of file + return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 
f278951ba..0bf028f61 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor): { 'format': fnode.text, 'url': video_url_template % fnode.text, + 'ext': fnode.text.partition('-')[0] } for fnode in format_doc.findall('./formats/format') @@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor): } # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['format'].partition('-')[0] + info.update(formats[-1]) return info -- cgit v1.2.3 From 92790f4e542fc3d5f4cc02a647a2695d9175d464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 21:41:49 +0200 Subject: [soundcloud] Add an extractor for users (closes #1426) --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/soundcloud.py | 45 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f9542d2..19d57c2e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -82,7 +82,7 @@ from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@ import json import re +import itertools from .common import InfoExtractor from ..utils import ( compat_str, compat_urlparse, + compat_urllib_parse, ExtractorError, unified_strdate, @@ -53,10 +55,11 @@ class 
SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None): + def _extract_info_dict(self, info, full_title=None, quiet=False): video_id = info['id'] name = full_title or video_id - self.report_extraction(name) + if quiet == False: + self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE): 'id': info['id'], 'title': info['title'], } + + +class SoundcloudUserIE(SoundcloudIE): + _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P[^/]+)(/?(tracks/)?)?(\?.*)?$' + IE_NAME = u'soundcloud:user' + + # it's in tests/test_playlists.py + _TEST = None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader = mobj.group('user') + + url = 'http://soundcloud.com/%s/' % uploader + resolv_url = self._resolv_url(url) + user_json = self._download_webpage(resolv_url, uploader, + u'Downloading user info') + user = json.loads(user_json) + + tracks = [] + for i in itertools.count(): + data = compat_urllib_parse.urlencode({'offset': i*50, + 'client_id': self._CLIENT_ID, + }) + tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' 
% user['id'] + data + response = self._download_webpage(tracks_url, uploader, + u'Downloading tracks page %s' % (i+1)) + new_tracks = json.loads(response) + tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) + if len(new_tracks) < 50: + break + + return { + '_type': 'playlist', + 'id': compat_str(user['id']), + 'title': user['username'], + 'entries': tracks, + } -- cgit v1.2.3 From e69ae5b9e74910541e75eea4c8dfc13066f28f65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Sep 2013 12:14:59 +0200 Subject: [youtube] support youtube.googleapis.com/v/* urls (fixes #1425) --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f49665925..e4a2e22bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): ( (?:https?://)? # http(s):// (optional) (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + tube\.majestyc\.net/| + youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ -- cgit v1.2.3 From 5a6fecc3dee35f95f3590a31e51670819db5a1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Sep 2013 23:30:58 +0200 Subject: Add an extractor for southparkstudios.com (closes #1434) It uses the MTV system --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/southparkstudios.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/southparkstudios.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19d57c2e9..246f1e8b5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .southparkstudios import SouthParkStudiosIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py new file mode 100644 index 000000000..a5dc754dd --- /dev/null +++ b/youtube_dl/extractor/southparkstudios.py @@ -0,0 +1,34 @@ +import re + +from .mtv import MTVIE, _media_xml_tag + + +class SouthParkStudiosIE(MTVIE): + IE_NAME = u'southparkstudios.com' + _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P\d+)' + + _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + + _TEST = { + u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', + u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', + u'info_dict': { + u'title': u'Bat Daded', + u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', + }, + } + + # Overwrite 
MTVIE properties we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) -- cgit v1.2.3 From 71c107fc5716dc769860ba6d3731184bde9a6902 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 16 Sep 2013 14:45:14 +0200 Subject: Add FKTV extractor Support for Fernsehkritik-TV (incl. Postecke) --- youtube_dl/extractor/fktv.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/fktv.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py new file mode 100644 index 000000000..239d9df38 --- /dev/null +++ b/youtube_dl/extractor/fktv.py @@ -0,0 +1,58 @@ +import re,random + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + +class FKTVIE(InfoExtractor): + """Information Extractor for Fernsehkritik-TV""" + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P[0-9]+)(?:/.*)?' 
+ + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2,4) + video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode + videos = [] + # Download all three parts + for i in range(1,4): + video_id = '%04d%d' % (episode, i) + video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i==1 else '-%d'%i) + video_title = 'Fernsehkritik %d.%d' % (episode, i) + videos.append({ + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': video_title, + 'thumbnail': video_thumbnail + }) + return videos + +class FKTVPosteckeIE(InfoExtractor): + """Information Extractor for Fernsehkritik-TV Postecke""" + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P[0-9]+)(&|$)' + _TEST = { + u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', + u'file': u'0120.flv', + u'md5': u'262f0adbac80317412f7e57b4808e5c4', + u'info_dict': { + u"title": u"Postecke 120" + } + } + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2,4) + video_id = '%04d' % episode + video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode) + video_title = 'Postecke %d' % episode + return[{ + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': video_title, + }] -- cgit v1.2.3 From 0761d02b0baf20955bd6e4f53568a3bbaa75ab5c Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 16 Sep 2013 14:46:19 +0200 Subject: Add FKTV extractor --- youtube_dl/extractor/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f9542d2..25a8e3cf5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ 
-28,6 +28,10 @@ from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE from .facebook import FacebookIE +from .fktv import ( + FKTVIE, + FKTVPosteckeIE, +) from .flickr import FlickrIE from .francetv import ( PluzzIE, -- cgit v1.2.3 From c4ece785647e58afb4f7b72f492eaf8e714bceba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Sep 2013 19:34:10 +0200 Subject: [ooyala] add support for more type of video urls, like m3u8 manifests. --- youtube_dl/extractor/ooyala.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index b734722d0..01b3637c9 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -35,7 +35,9 @@ class OoyalaIE(InfoExtractor): mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', player, u'mobile player url') mobile_player = self._download_webpage(mobile_url, embedCode) - videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') + videos_info = self._search_regex( + r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', + mobile_player, u'info').replace('\\"','"') videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') videos_info = json.loads(videos_info) videos_more_info =json.loads(videos_more_info) -- cgit v1.2.3 From 4b6462fc1e4306e4a1a5b3613b2cef5b09cc9abe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Sep 2013 19:39:39 +0200 Subject: Add an extractor for Bloomberg (closes #1436) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bloomberg.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 youtube_dl/extractor/bloomberg.py (limited to 
'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 246f1e8b5..7973a81d0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .arte import ArteTvIE from .auengine import AUEngineIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE +from .bloomberg import BloombergIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .c56 import C56IE diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py new file mode 100644 index 000000000..3666a780b --- /dev/null +++ b/youtube_dl/extractor/bloomberg.py @@ -0,0 +1,27 @@ +import re + +from .common import InfoExtractor + + +class BloombergIE(InfoExtractor): + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?).html' + + _TEST = { + u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', + u'info_dict': { + u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', + u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', + }, + u'params': { + # Requires ffmpeg (m3u8 manifest) + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + ooyala_url = self._og_search_video_url(webpage) + return self.url_result(ooyala_url, ie='Ooyala') -- cgit v1.2.3 From 4dc0ff3ecf2118a0bac128cb8e006e151222e23b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Sep 2013 20:16:52 +0200 Subject: [ooyala] prefer ipad url It has better quality with m3u8 manifests --- youtube_dl/extractor/ooyala.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 
01b3637c9..d189a9852 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -22,7 +22,7 @@ class OoyalaIE(InfoExtractor): return {'id': info['embedCode'], 'ext': 'mp4', 'title': unescapeHTML(info['title']), - 'url': info['url'], + 'url': info.get('ipad_url') or info['url'], 'description': unescapeHTML(more_info['description']), 'thumbnail': more_info['promo'], } -- cgit v1.2.3 From e8f8e800978c8845a706ebd3ab31bc1b98a51461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Sep 2013 20:58:36 +0200 Subject: Add an extractor for vice.com (closes #1051) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ooyala.py | 4 ++++ youtube_dl/extractor/vice.py | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 youtube_dl/extractor/vice.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7973a81d0..761575062 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -104,6 +104,7 @@ from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE +from .vice import ViceIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index d189a9852..1f7b4d2e7 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -18,6 +18,10 @@ class OoyalaIE(InfoExtractor): }, } + @staticmethod + def _url_for_embed_code(embed_code): + return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + def _extract_result(self, info, more_info): return {'id': info['embedCode'], 'ext': 'mp4', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py new file mode 100644 index 000000000..6b93afa50 --- /dev/null +++ b/youtube_dl/extractor/vice.py @@ -0,0 +1,38 @@ 
+import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ExtractorError + + +class ViceIE(InfoExtractor): + _VALID_URL = r'http://www.vice.com/.*?/(?P.+)' + + _TEST = { + u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4', + u'info_dict': { + u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + u'params': { + # Requires ffmpeg (m3u8 manifest) + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + try: + ooyala_url = self._og_search_video_url(webpage) + except ExtractorError: + try: + embed_code = self._search_regex( + r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage, + u'ooyala embed code') + ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + except ExtractorError: + raise ExtractorError(u'The page doesn\'t contain a video', expected=True) + return self.url_result(ooyala_url, ie='Ooyala') + -- cgit v1.2.3 From 6ae8ee3f542485b3c790fc09e1136762b1b80c89 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 17 Sep 2013 16:59:12 +0200 Subject: Update 85 signature (Fixes #1449) This is the first signature algorithm to have been parsed automatically, although that only works for HTML5 players for now, and is not yet integrated into master. 
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4a2e22bc..0c963fd20 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -429,7 +429,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 86: return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] elif len(s) == 85: - return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1] + return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: return s[81:36:-1] + s[0] + s[35:2:-1] elif len(s) == 83: -- cgit v1.2.3 From 4a67aafb7e725c49e7bb3bcc5aea3fb3ae5fb42d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Sep 2013 20:59:55 +0200 Subject: [youtube] Don't search the flash player version for videos with age gate activated --- youtube_dl/extractor/youtube.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0c963fd20..f227e2086 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -783,10 +783,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if self._downloader.params.get('verbose'): s = url_data['s'][0] if age_gate: - player_version = self._search_regex(r'ad3-(.+?)\.swf', - video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND', - 'flash player', fatal=False) - player = 'flash player %s' % player_version + player = 'flash player' else: player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) -- cgit v1.2.3 From 6523223a4c6f8924ac156b3fc2f5519a53b58e4b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Sep 2013 21:10:57 +0200 Subject: [hotnewhiphop] Fix test case title --- youtube_dl/extractor/hotnewhiphop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ccca1d7e0..3798118a7 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -7,11 +7,11 @@ from .common import InfoExtractor class HotNewHipHopIE(InfoExtractor): _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P.*)\.html' _TEST = { - u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html'", + u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html", u'file': u'1435540.mp3', u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', u'info_dict': { - u"title": u"Freddie Gibbs Songs - Lay It Down" + u"title": u"Freddie Gibbs - Lay It Down" } } -- cgit v1.2.3 From 5d13df79a51235392bde81274c90e780041e12b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Sep 2013 22:49:43 +0200 Subject: [francetv] Remove Pluzz test Videos expire in 7 days --- youtube_dl/extractor/francetv.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f2b12c884..b8fe82e47 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -34,17 +34,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = u'pluzz.francetv.fr' _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' - _TEST = { - u'url': u'http://pluzz.francetv.fr/videos/allo_rufo_saison5_,88439064.html', - u'file': u'88439064.mp4', - u'info_dict': { - u'title': u'Allô Rufo', - u'description': u'md5:d909f1ebdf963814b65772aea250400e', - }, - u'params': { - u'skip_download': True, - 
}, - } + # Can't use tests, videos expire in 7 days def _real_extract(self, url): title = re.match(self._VALID_URL, url).group(1) -- cgit v1.2.3 From 1237c9a3a5ef0abca961f7f2252fde7f9e99db66 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Tue, 17 Sep 2013 06:24:20 +0200 Subject: XHamsterIE: Fix support for new HD video url format and add test (closes PR #1443) --- youtube_dl/extractor/xhamster.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fa759d30c..361619694 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,8 +11,8 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' + _TESTS = [{ u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', u'file': u'1509445.flv', u'md5': u'9f48e0e8d58e3076bb236ff412ab62fa', @@ -21,13 +21,24 @@ class XHamsterIE(InfoExtractor): u"uploader_id": u"Ruseful2011", u"title": u"FemaleAgent Shy beauty takes the bait" } - } + }, + { + u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + u'file': u'2221348.flv', + u'md5': u'e767b9475de189320f691f49c679c4c7', + u'info_dict': { + u"upload_date": u"20130914", + u"uploader_id": u"jojo747400", + u"title": u"Britney Spears Sexy Booty" + } + }] def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id + seo = mobj.group('seo') + mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': 
\'(?P[^\']+)\',', webpage) -- cgit v1.2.3 From eb03f4dad3ebb0f781e6742b6c1c590506d58e5b Mon Sep 17 00:00:00 2001 From: Ruirize Date: Wed, 18 Sep 2013 15:54:45 +0100 Subject: Added Newgrounds support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/newgrounds.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/newgrounds.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 761575062..e1ec38cf2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -68,6 +68,7 @@ from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE +from .newgrounds import NewgroundsIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py new file mode 100644 index 000000000..d19145a72 --- /dev/null +++ b/youtube_dl/extractor/newgrounds.py @@ -0,0 +1,37 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class NewgroundsIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P\d+)' + _TEST = { + u'url': u'http://www.newgrounds.com/audio/listen/549479', + u'file': u'549479_B7---BusMode.mp3', + u'md5': u'2924d938f60415cd7afbe7ae9042a99e', + u'info_dict': { + u"title": u"B7 - BusMode", + u"uploader" : u"Burn7", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + music_id = mobj.group('id') + webpage = self._download_webpage(url, music_id) + + title = self._html_search_regex(r',"name":"([^"]+)",', webpage, 'music title', flags=re.DOTALL) + uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, 'music uploader', flags=re.DOTALL) + + music_url_json_string = '{"url":"' + self._html_search_regex(r'{"url":"([^"]+)",', webpage, 'music 
url', flags=re.DOTALL) + '"}' + music_url_json = json.loads(music_url_json_string) + music_url = music_url_json['url'] + + return [{ + 'id': music_id, + 'title': title, + 'url': music_url, + 'uploader': uploader, + 'ext': determine_ext(music_url), + }] -- cgit v1.2.3 From 1ef80b55ddf05d7fe2bcba08c414aa10c524870d Mon Sep 17 00:00:00 2001 From: Ruirize Date: Wed, 18 Sep 2013 16:23:38 +0100 Subject: Fixes test fail Was unaware of --id being passed to test. --- youtube_dl/extractor/newgrounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index d19145a72..f316b9272 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -8,7 +8,7 @@ class NewgroundsIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P\d+)' _TEST = { u'url': u'http://www.newgrounds.com/audio/listen/549479', - u'file': u'549479_B7---BusMode.mp3', + u'file': u'549479.mp3', u'md5': u'2924d938f60415cd7afbe7ae9042a99e', u'info_dict': { u"title": u"B7 - BusMode", -- cgit v1.2.3 From a19413c311e1bd2ffef2705212a8719b7126eef9 Mon Sep 17 00:00:00 2001 From: Ruirize Date: Wed, 18 Sep 2013 17:17:12 +0100 Subject: Changed file hash. 
--- youtube_dl/extractor/newgrounds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index f316b9272..e66294ade 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -9,7 +9,7 @@ class NewgroundsIE(InfoExtractor): _TEST = { u'url': u'http://www.newgrounds.com/audio/listen/549479', u'file': u'549479.mp3', - u'md5': u'2924d938f60415cd7afbe7ae9042a99e', + u'md5': u'fe6033d297591288fa1c1f780386f07a', u'info_dict': { u"title": u"B7 - BusMode", u"uploader" : u"Burn7", -- cgit v1.2.3 From d0ae9e3a8d807d0466bccc27186c8c2d86215350 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 18 Sep 2013 22:14:43 +0200 Subject: [newgrounds] simplify --- youtube_dl/extractor/newgrounds.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index e66294ade..2ef80bce0 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import determine_ext + class NewgroundsIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P\d+)' _TEST = { @@ -12,7 +13,7 @@ class NewgroundsIE(InfoExtractor): u'md5': u'fe6033d297591288fa1c1f780386f07a', u'info_dict': { u"title": u"B7 - BusMode", - u"uploader" : u"Burn7", + u"uploader": u"Burn7", } } @@ -21,17 +22,17 @@ class NewgroundsIE(InfoExtractor): music_id = mobj.group('id') webpage = self._download_webpage(url, music_id) - title = self._html_search_regex(r',"name":"([^"]+)",', webpage, 'music title', flags=re.DOTALL) - uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, 'music uploader', flags=re.DOTALL) + title = self._html_search_regex(r',"name":"([^"]+)",', webpage, u'music title') + 
uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, u'music uploader') - music_url_json_string = '{"url":"' + self._html_search_regex(r'{"url":"([^"]+)",', webpage, 'music url', flags=re.DOTALL) + '"}' + music_url_json_string = self._html_search_regex(r'({"url":"[^"]+"),', webpage, u'music url') + '}' music_url_json = json.loads(music_url_json_string) music_url = music_url_json['url'] - return [{ + return { 'id': music_id, - 'title': title, + 'title': title, 'url': music_url, 'uploader': uploader, 'ext': determine_ext(music_url), - }] + } -- cgit v1.2.3 From 71c82637e7add9b437bc6dbe03035d6d8aae82e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Sep 2013 23:00:32 +0200 Subject: [youtube] apply the fix for lists with number of videos multiple of _MAX_RESULTS to user extraction Copied from the playlist extractor. --- youtube_dl/extractor/youtube.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f227e2086..23a8097c5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1005,6 +1005,9 @@ class YoutubeUserIE(InfoExtractor): response = json.loads(page) except ValueError as err: raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS + break # Extract video identifiers ids_in_page = [] -- cgit v1.2.3 From c5e743f66f5637fe02fe0b5167fab99a06b903e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Sep 2013 23:32:37 +0200 Subject: [fktv] support videos splitted in any number of parts and some style changes --- youtube_dl/extractor/fktv.py | 57 ++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 18 deletions(-) (limited to 'youtube_dl/extractor') diff --git 
a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 239d9df38..9c89362ef 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -1,37 +1,58 @@ -import re,random +import re +import random +import json from .common import InfoExtractor from ..utils import ( determine_ext, + get_element_by_id, + clean_html, ) + class FKTVIE(InfoExtractor): - """Information Extractor for Fernsehkritik-TV""" + IE_NAME = u'fernsehkritik.tv' _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P[0-9]+)(?:/.*)?' - def _real_extract(self,url): + _TEST = { + u'url': u'http://fernsehkritik.tv/folge-1', + u'file': u'00011.flv', + u'info_dict': { + u'title': u'Folge 1 vom 10. April 2007', + u'description': u'md5:fb4818139c7cfe6907d4b83412a6864f', + }, + } + + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) episode = int(mobj.group('ep')) - - server = random.randint(2,4) + + server = random.randint(2, 4) video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode + start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, + episode) + playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, + u'playlist', flags=re.DOTALL) + files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) + # TODO: return a single multipart video videos = [] - # Download all three parts - for i in range(1,4): + for i, _ in enumerate(files, 1): video_id = '%04d%d' % (episode, i) - video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i==1 else '-%d'%i) + video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) video_title = 'Fernsehkritik %d.%d' % (episode, i) videos.append({ - 'id': video_id, - 'url': video_url, - 'ext': determine_ext(video_url), - 'title': video_title, + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': clean_html(get_element_by_id('eptitle', 
start_webpage)), + 'description': clean_html(get_element_by_id('contentlist', start_webpage)), 'thumbnail': video_thumbnail }) return videos + class FKTVPosteckeIE(InfoExtractor): - """Information Extractor for Fernsehkritik-TV Postecke""" + IE_NAME = u'fernsehkritik.tv:postecke' _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P[0-9]+)(&|$)' _TEST = { u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', @@ -42,17 +63,17 @@ class FKTVPosteckeIE(InfoExtractor): } } - def _real_extract(self,url): + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) episode = int(mobj.group('ep')) - - server = random.randint(2,4) + + server = random.randint(2, 4) video_id = '%04d' % episode video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode) video_title = 'Postecke %d' % episode - return[{ + return { 'id': video_id, 'url': video_url, 'ext': determine_ext(video_url), 'title': video_title, - }] + } -- cgit v1.2.3 From bc4b9008981096184739666941e73c8d09623502 Mon Sep 17 00:00:00 2001 From: patrickslin Date: Thu, 19 Sep 2013 21:49:06 -0700 Subject: Unable to decrypt signature length 93 (fixes #1461) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 23a8097c5..e5f536e6f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -416,7 +416,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _decrypt_signature(self, s): """Turn the encrypted s field into a working signature""" - if len(s) == 92: + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] + elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] 
+ s[78:81] -- cgit v1.2.3 From 1a810f0d4e63ba702e49b7404c3f5f74ef716759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 20 Sep 2013 13:05:34 +0200 Subject: [funnyordie] Fix video url extraction --- youtube_dl/extractor/funnyordie.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index f3d86a711..2ccdb7073 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,8 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'type="video/mp4" src="(.*?)"', + video_url = self._search_regex( + [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], webpage, u'video URL', flags=re.DOTALL) info = { -- cgit v1.2.3 From 38d025b3f0f6f349c36a4531f3b36d7e7553f417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 20 Sep 2013 14:43:16 +0200 Subject: [youtube] add algo for length 91 --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e5f536e6f..47d5cb7ff 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -420,6 +420,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return s[86:29:-1] + s[88] + s[28:5:-1] elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: -- cgit v1.2.3 From 3d60bb96e138ce8221f35b7f9d1e1b28f235083e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 20 Sep 2013 16:55:50 +0200 Subject: Add an extractor for ebaumsworld.com (closes #1462) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ebaumsworld.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/ebaumsworld.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 726c9fa15..c6a55f194 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -24,6 +24,7 @@ from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE +from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py new file mode 100644 index 000000000..f02c6998b --- /dev/null +++ b/youtube_dl/extractor/ebaumsworld.py @@ -0,0 +1,37 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class EbaumsWorldIE(InfoExtractor): + _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P\d+)' + + _TEST = { + u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', + u'file': u'83367677.mp4', + u'info_dict': { + u'title': u'A Giant Python Opens The Door', + u'description': u'This is how nightmares start...', + u'uploader': u'jihadpizza', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + config_xml = self._download_webpage( + 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + video_url = config.find('file').text + + return { + 'id': video_id, + 'title': config.find('title').text, + 'url': video_url, + 'ext': 
determine_ext(video_url), + 'description': config.find('description').text, + 'thumbnail': config.find('image').text, + 'uploader': config.find('username').text, + } -- cgit v1.2.3 From 0fd49457f5257dbe317c69314ee57a6c485d41a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Sep 2013 10:51:25 +0200 Subject: [southparkstudios] Fix mgid extraction --- youtube_dl/extractor/southparkstudios.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index a5dc754dd..25f799a27 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -14,7 +14,7 @@ class SouthParkStudiosIE(MTVIE): u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', u'info_dict': { u'title': u'Bat Daded', - u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', + u'description': u'Randy finally gets the chance to fight Bat Dad and gets the boys disqualified from the season championships.', }, } @@ -29,6 +29,6 @@ class SouthParkStudiosIE(MTVIE): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', + mgid = self._search_regex(r'data-mgid="(mgid:.*?)"', webpage, u'mgid') return self._get_videos_info(mgid) -- cgit v1.2.3 From 69b227a9bc75a75e9156f05d08c3c69337be64ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Sep 2013 10:58:43 +0200 Subject: [southparkstudios] add support for http://www.southparkstudios.com/full-episodes/* urls (closes #1469) --- youtube_dl/extractor/southparkstudios.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/southparkstudios.py 
b/youtube_dl/extractor/southparkstudios.py index 25f799a27..1a611d3bb 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -5,7 +5,7 @@ from .mtv import MTVIE, _media_xml_tag class SouthParkStudiosIE(MTVIE): IE_NAME = u'southparkstudios.com' - _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P\d+)' + _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$)' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -23,7 +23,11 @@ class SouthParkStudiosIE(MTVIE): def _get_thumbnail_url(self, uri, itemdoc): search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + else: + return thumb_node.attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) -- cgit v1.2.3 From 3a1d48d6de0159807ff57b2cec6766cbfd400f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Sep 2013 12:15:54 +0200 Subject: [dailymotion] Raise ExtractorError if the dailymotion response reports an error --- youtube_dl/extractor/dailymotion.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 360113f9c..ce7057a26 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -63,6 +63,9 @@ class DailymotionIE(SubtitlesInfoExtractor): info = self._search_regex(r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE) info = json.loads(info) + if info.get('error') is not None: + msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] + raise ExtractorError(msg, expected=True) # TODO: support choosing qualities -- cgit v1.2.3 From 39baacc49f323adc639d502d38a016ebd63acd75 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Sep 2013 12:45:53 +0200 Subject: [dailymotion] Add an extractor for users (closes #1476) --- youtube_dl/extractor/__init__.py | 6 ++++- youtube_dl/extractor/dailymotion.py | 44 +++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c6a55f194..949f59a44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -18,7 +18,11 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE, DailymotionPlaylistIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) from .daum import DaumIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ce7057a26..64b89aae8 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -114,28 +114,54 @@ class DailymotionIE(SubtitlesInfoExtractor): class DailymotionPlaylistIE(InfoExtractor): + IE_NAME = u'dailymotion:playlist' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' _MORE_PAGES_INDICATOR = r'' + _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + def _extract_entries(self, id): video_ids = [] - for pagenum in itertools.count(1): - webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), - playlist_id, u'Downloading page %s' % pagenum) + webpage = self._download_webpage(self._PAGE_TEMPLATE % (id, pagenum), + id, u'Downloading page %s' % pagenum) playlist_el = 
get_element_by_attribute(u'class', u'video_list', webpage) video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break - - entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') for video_id in video_ids] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) + return {'_type': 'playlist', 'id': playlist_id, 'title': get_element_by_id(u'playlist_name', webpage), - 'entries': entries, + 'entries': self._extract_entries(playlist_id), } + + +class DailymotionUserIE(DailymotionPlaylistIE): + IE_NAME = u'dailymotion:user' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P[^/]+)' + _MORE_PAGES_INDICATOR = r'' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + webpage = self._download_webpage(url, user) + full_user = self._html_search_regex( + r'(.*?) 
Date: Sat, 21 Sep 2013 13:50:52 +0200 Subject: [livestream] Fix events extraction (fixes #1467) --- youtube_dl/extractor/livestream.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 309921078..d04da98c8 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,7 +2,12 @@ import re import json from .common import InfoExtractor -from ..utils import compat_urllib_parse_urlparse, compat_urlparse +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urlparse, + get_meta_content, + ExtractorError, +) class LivestreamIE(InfoExtractor): @@ -35,8 +40,11 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', - webpage, 'api url') + player = get_meta_content('twitter:player', webpage) + if player is None: + raise ExtractorError('Couldn\'t extract event api url') + api_url = player.replace('/player', '') + api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) info = json.loads(self._download_webpage(api_url, event_name, u'Downloading event info')) videos = [self._extract_video_info(video_data['data']) -- cgit v1.2.3 From e0df6211cc9364f62406b2907fa830847324db53 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 14:19:30 +0200 Subject: Restore accidentally deleted commits That's what happens if you let Windows machines write :( --- youtube_dl/extractor/youtube.py | 603 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 586 insertions(+), 17 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 47d5cb7ff..456d3cb0f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,11 +1,16 @@ # coding: utf-8 +import collections +import itertools +import 
io import json import netrc import re import socket -import itertools -import xml.etree.ElementTree +import string +import struct +import traceback +import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor @@ -393,6 +398,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._jsplayer_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -413,15 +422,565 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url): + id_m = re.match(r'.*-(?P[^.]+)\.(?P[^.]+)$', player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + return self._parse_sig_js(code) + elif player_tpye == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + return self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, 
allow_recursion=20): + if allow_recursion < 0: + raise ExctractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + 
fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + _, pos = u30(pos=pos) # Slot id + type_name_idx, pos = u30(pos=pos) + vindex, pos = u30(pos=pos) + if vindex != 0: + _, pos = read_byte(pos=pos) # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + _, pos = u30(pos=pos) # disp_id + method_idx, pos = u30(pos=pos) + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + _, pos = u30(pos=pos) # slot_id + _, pos = u30(pos=pos) # classi + elif kind == 0x05: # Function + _, pos = u30(pos=pos) # slot_id + function_idx, pos = u30(pos=pos) + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # 
Metadata present + metadata_count, pos = u30(pos=pos) + for _c3 in range(metadata_count): + _, pos = u30(pos=pos) + + return (methods, pos) + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count, p = u30() + for class_id in range(class_count): + name_idx, p = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + _, p = u30() # super_name idx + flags, p = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + protected_ns_idx, p = u30() + intrf_count, p = u30() + for _c2 in range(intrf_count): + _, p = u30() + _, p = u30() # iinit + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + _, p = u30() # cinit + trait_count, p = u30() + for _c2 in range(trait_count): + trait_methods, p = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count, p = u30() + for _c in range(script_count): + _, p = u30() # init + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + # Method bodies + method_body_count, p = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx, p = u30() + max_stack, p = u30() + local_count, p = u30() + init_scope_depth, p = u30() + max_scope_depth, p = u30() + code_length, p = u30() + if method_idx in method_idxs: + m = Method(code_tag[p:p+code_length], local_count) + methods[method_idxs[method_idx]] = m + p += code_length + exception_count, p = u30() + for _c2 in range(exception_count): + _, p = u30() 
# from + _, p = u30() # to + _, p = u30() # target + _, p = u30() # exc_type + _, p = u30() # var_name + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + assert p == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + print('Entering function %s(%r)' % (func_name, args)) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + 
stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + _ = u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 93: - return s[86:29:-1] + s[88] + s[28:5:-1] - elif len(s) == 92: + if jsplayer_url is not None: + try: + if jsplayer_url not in self._jsplayer_cache: + self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( + video_id, jsplayer_url + ) + return self._jsplayer_cache[jsplayer_url]([s]) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + + 
self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] - elif len(s) == 91: - return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: @@ -631,7 +1190,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -784,21 +1343,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: + encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): - s = url_data['s'][0] if age_gate: - player = 'flash player' + player_version = self._search_regex(r'-(.+)\.swf$', + player_url if player_url else 'NOT FOUND', + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, 
url_data['itag'][0], player)) - encrypted_sig = url_data['s'][0] + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) + jsplayer_url = None else: - signature = self._decrypt_signature(encrypted_sig) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + jsplayer_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' -- cgit v1.2.3 From a7177865b19cdf711f15e01541aee9deae97a56c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 14:48:12 +0200 Subject: Implement more opcodes --- youtube_dl/extractor/youtube.py | 45 ++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 456d3cb0f..b57693ee6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -863,13 +863,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): coder = io.BytesIO(m.code) while True: opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 36: # pushbyte + if opcode == 36: # pushbyte v = struct.unpack('!B', coder.read(1))[0] stack.append(v) elif opcode == 44: # pushstring @@ -895,12 +889,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: res = obj.split(args[0]) stack.append(res) + elif mname == u'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == 
u'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) elif mname in method_pyfunctions: stack.append(method_pyfunctions[mname](args)) else: raise NotImplementedError( u'Unsupported property %r on %r' % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 79: # callpropvoid + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + u'Unsupported (void) property %r on %r' + % (mname, obj)) elif opcode == 93: # findpropstrict index = u30(coder) mname = multinames[index] @@ -943,6 +966,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): value1 = stack.pop() res = value1 % value2 stack.append(res) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) elif opcode == 214: # setlocal_2 registers[2] = stack.pop() elif opcode == 215: # setlocal_3 -- cgit v1.2.3 From 95dbd2f9907416e86424e4372dbd2593c1699e7d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:10:38 +0200 Subject: Change test target (Verified with node.js) --- youtube_dl/extractor/youtube.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b57693ee6..45b593a12 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -857,7 +857,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): m = methods[func_name] def resfunc(args): - print('Entering function %s(%r)' % 
(func_name, args)) registers = ['(this)'] + list(args) + [None] * m.local_count stack = [] coder = io.BytesIO(m.code) -- cgit v1.2.3 From 8379969834b787708ef5574dc447028c1caf295b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:19:48 +0200 Subject: Prepare signature function caching --- youtube_dl/extractor/youtube.py | 57 +++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 45b593a12..2cd2fdce3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -400,7 +400,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) - self._jsplayer_cache = {} + self._player_cache = {} def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" @@ -423,26 +423,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'RTMP download detected') def _extract_signature_function(self, video_id, player_url): - id_m = re.match(r'.*-(?P[^.]+)\.(?P[^.]+)$', player_url) + id_m = re.match(r'.*-(?P[a-zA-Z0-9]+)\.(?P[a-z]+)$', + player_url) player_type = id_m.group('ext') player_id = id_m.group('id') + # TODO read from filesystem cache + if player_type == 'js': code = self._download_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, jsplayer_id), + note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) - return self._parse_sig_js(code) + res = self._parse_sig_js(code) elif player_tpye == 'swf': urlh = self._request_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, jsplayer_id), + note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) code = 
urlh.read() - return self._parse_sig_swf(code) + res = self._parse_sig_swf(code) else: assert False, 'Invalid player type %r' % player_type + # TODO write cache + + return res + def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, @@ -987,22 +994,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): initial_function = extract_function(u'decipher') return lambda s: initial_function([s]) - def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if jsplayer_url is not None: + if player_url is not None: try: - if jsplayer_url not in self._jsplayer_cache: - self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( - video_id, jsplayer_url + if player_url not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url ) - return self._jsplayer_cache[jsplayer_url]([s]) + self._player_cache[player_url] = func + return self._player_cache[player_url](s) except Exception as e: tb = traceback.format_exc() - self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + self._downloader.report_warning( + u'Automatic signature extraction failed: ' + tb) - self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') + return self._static_decrypt_signature(s) + def _static_decrypt_signature(self, s): if age_gate: # The videos with age protection use another player, so the # algorithms can be different. 
@@ -1376,12 +1388,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): if age_gate: - player_version = self._search_regex(r'-(.+)\.swf$', - player_url if player_url else 'NOT FOUND', + player_version = self._search_regex( + r'-(.+)\.swf$', + player_url if player_url else None, 'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: - player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) player_desc = u'html5 player %s' % player_version @@ -1389,15 +1403,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) - if age_gate: - jsplayer_url = None - else: + if not age_gate: jsplayer_url_json = self._search_regex( r'"assets":.+?"js":\s*("[^"]+")', video_webpage, u'JS player URL') - jsplayer_url = json.loads(jsplayer_url_json) + player_url = json.loads(jsplayer_url_json) - signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' -- cgit v1.2.3 From ba552f542f674d35de21d48978f211b8db3f0ff8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:32:37 +0200 Subject: Use reader instead of indexing --- youtube_dl/extractor/youtube.py | 262 +++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 140 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2cd2fdce3..09bd423f5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py 
@@ -590,99 +590,83 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): for tag_code, tag in extract_tags(content) if tag_code == 82) p = code_tag.index(b'\0', 4) + 1 + code_reader = io.BytesIO(code_tag[p:]) # Parse ABC (AVM2 ByteCode) - def read_int(data=None, pos=None): - if hasattr(data, 'read'): - assert pos is None - - res = 0 - shift = 0 - for _ in range(5): - buf = data.read(1) - assert len(buf) == 1 - b = struct.unpack('> 4 methods = {} if kind in [0x00, 0x06]: # Slot or Const - _, pos = u30(pos=pos) # Slot id - type_name_idx, pos = u30(pos=pos) - vindex, pos = u30(pos=pos) + _ = u30() # Slot id + type_name_idx = u30() + vindex = u30() if vindex != 0: - _, pos = read_byte(pos=pos) # vkind + _ = read_byte() # vkind elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - _, pos = u30(pos=pos) # disp_id - method_idx, pos = u30(pos=pos) + _ = u30() # disp_id + method_idx = u30() methods[multinames[trait_name_idx]] = method_idx elif kind == 0x04: # Class - _, pos = u30(pos=pos) # slot_id - _, pos = u30(pos=pos) # classi + _ = u30() # slot_id + _ = u30() # classi elif kind == 0x05: # Function - _, pos = u30(pos=pos) # slot_id - function_idx, pos = u30(pos=pos) + _ = u30() # slot_id + function_idx = u30() methods[function_idx] = multinames[trait_name_idx] else: raise ExtractorError(u'Unsupported trait kind %d' % kind) if attrs & 0x4 != 0: # Metadata present - metadata_count, pos = u30(pos=pos) + metadata_count = u30() for _c3 in range(metadata_count): - _, pos = u30(pos=pos) + _ = u30() - return (methods, pos) + return methods # Classes TARGET_CLASSNAME = u'SignatureDecipher' searched_idx = multinames.index(TARGET_CLASSNAME) searched_class_id = None - class_count, p = u30() + class_count = u30() for class_id in range(class_count): - name_idx, p = u30() + name_idx = u30() if name_idx == searched_idx: # We found the class we're looking for! 
searched_class_id = class_id - _, p = u30() # super_name idx - flags, p = read_byte() + _ = u30() # super_name idx + flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present - protected_ns_idx, p = u30() - intrf_count, p = u30() + protected_ns_idx = u30() + intrf_count = u30() for _c2 in range(intrf_count): - _, p = u30() - _, p = u30() # iinit - trait_count, p = u30() + _ = u30() + _ = u30() # iinit + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() if searched_class_id is None: raise ExtractorError(u'Target class %r not found' % @@ -807,10 +789,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): method_names = {} method_idxs = {} for class_id in range(class_count): - _, p = u30() # cinit - trait_count, p = u30() + _ = u30() # cinit + trait_count = u30() for _c2 in range(trait_count): - trait_methods, p = parse_traits_info() + trait_methods = parse_traits_info() if class_id == searched_class_id: method_names.update(trait_methods.items()) method_idxs.update(dict( @@ -818,40 +800,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): for name, idx in trait_methods.items())) # Scripts - script_count, p = u30() + script_count = u30() for _c in range(script_count): - _, p = u30() # init - trait_count, p = u30() + _ = u30() # init + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() # Method bodies - method_body_count, p = u30() + method_body_count = u30() Method = collections.namedtuple('Method', ['code', 'local_count']) methods = {} for _c in range(method_body_count): - method_idx, p = u30() - max_stack, p = u30() - local_count, p = u30() - init_scope_depth, p = u30() - max_scope_depth, p = u30() - code_length, p = u30() + method_idx = u30() + max_stack = u30() + local_count = u30() + init_scope_depth = u30() + max_scope_depth = u30() + code_length = u30() + code = read_bytes(code_length) if method_idx 
in method_idxs: - m = Method(code_tag[p:p+code_length], local_count) + m = Method(code, local_count) methods[method_idxs[method_idx]] = m - p += code_length - exception_count, p = u30() + exception_count = u30() for _c2 in range(exception_count): - _, p = u30() # from - _, p = u30() # to - _, p = u30() # target - _, p = u30() # exc_type - _, p = u30() # var_name - trait_count, p = u30() + _ = u30() # from + _ = u30() # to + _ = u30() # target + _ = u30() # exc_type + _ = u30() # var_name + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() - assert p == len(code_tag) + assert p + code_reader.tell() == len(code_tag) assert len(methods) == len(method_idxs) method_pyfunctions = {} -- cgit v1.2.3 From 2f2ffea9cad7d30165a0171bf6e662bef2182ab4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:34:29 +0200 Subject: Clarify a couple of calls --- youtube_dl/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 09bd423f5..5c0ea2e43 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -641,7 +641,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return res # minor_version + major_version - _ = read_bytes(4) + _ = read_bytes(2 + 2) # Constant pool int_count = u30() @@ -994,9 +994,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self._downloader.report_warning( u'Warning: Falling back to static signature algorithm') - return self._static_decrypt_signature(s) + return self._static_decrypt_signature( + s, video_id, player_url, age_gate) - def _static_decrypt_signature(self, s): + def _static_decrypt_signature(self, s, video_id, player_url, age_gate): if age_gate: # The videos with age protection use another player, so the # algorithms can be different. 
-- cgit v1.2.3 From 4a2080e4077e9e12c860d82a4d2eebc75c1ea54b Mon Sep 17 00:00:00 2001 From: tewe Date: Sun, 15 Sep 2013 21:58:49 +0200 Subject: [youku] better error handling blocked videos used to cause death by TypeError, now we report what the server says --- youtube_dl/extractor/youku.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 996d38478..00fa2ccb5 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -66,6 +66,12 @@ class YoukuIE(InfoExtractor): self.report_extraction(video_id) try: config = json.loads(jsondata) + error_code = config['data'][0].get('error_code') + if error_code: + # -8 means blocked outside China. + error = config['data'][0].get('error') # Chinese and English, separated by newline. + raise ExtractorError(error or u'Server reported error %i' % error_code, + expected=True) video_title = config['data'][0]['title'] seed = config['data'][0]['seed'] @@ -89,6 +95,7 @@ class YoukuIE(InfoExtractor): fileid = config['data'][0]['streamfileids'][format] keys = [s['k'] for s in config['data'][0]['segs'][format]] + # segs is usually a dictionary, but an empty *list* if an error occured. 
except (UnicodeDecodeError, ValueError, KeyError): raise ExtractorError(u'Unable to extract info section') -- cgit v1.2.3 From c4417ddb611e14b81fe56b6b32964c5802faf554 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 00:35:03 +0200 Subject: [youtube] Add filesystem signature cache --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5c0ea2e43..63f59ae8f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -4,8 +4,10 @@ import collections import itertools import io import json -import netrc +import operator +import os.path import re +import shutil import socket import string import struct @@ -422,13 +424,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _extract_signature_function(self, video_id, player_url): - id_m = re.match(r'.*-(?P[a-zA-Z0-9]+)\.(?P[a-z]+)$', + def _extract_signature_function(self, video_id, player_url, slen): + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', player_url) player_type = id_m.group('ext') player_id = id_m.group('id') - # TODO read from filesystem cache + # Read from filesystem cache + func_id = '%s_%s_%d' % (player_type, player_id, slen) + assert os.path.basename(func_id) == func_id + cache_dir = self.downloader.params.get('cachedir', + u'~/.youtube-dl/cache') + + if cache_dir is not False: + cache_fn = os.path.join(os.path.expanduser(cache_dir), + u'youtube-sigfuncs', + func_id + '.json') + try: + with io.open(cache_fn, '', encoding='utf-8') as cachef: + cache_spec = json.load(cachef) + return lambda s: u''.join(s[i] for i in cache_spec) + except OSError: + pass # No cache available if player_type == 'js': code = self._download_webpage( @@ -436,7 
+453,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) res = self._parse_sig_js(code) - elif player_tpye == 'swf': + elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, note=u'Downloading %s player %s' % (player_type, player_id), @@ -446,7 +463,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: assert False, 'Invalid player type %r' % player_type - # TODO write cache + if cache_dir is not False: + cache_res = res(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + shutil.makedirs(os.path.dirname(cache_fn)) + write_json_file(cache_spec, cache_fn) return res @@ -983,7 +1004,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): try: if player_url not in self._player_cache: func = self._extract_signature_function( - video_id, player_url + video_id, player_url, len(s) ) self._player_cache[player_url] = func return self._player_cache[player_url](s) -- cgit v1.2.3 From edf3e38ebd6c5db21585dc7b6384e325e6cfb540 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:30:02 +0200 Subject: [youtube] Improve cache and add an option to print the extracted signatures --- youtube_dl/extractor/youtube.py | 69 ++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63f59ae8f..4200f987e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,13 +1,13 @@ # coding: utf-8 import collections +import errno import itertools import io import json import operator import os.path import re -import shutil import socket import string import struct @@ -17,6 +17,7 @@ import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import 
SubtitlesInfoExtractor from ..utils import ( + compat_chr, compat_http_client, compat_parse_qs, compat_urllib_error, @@ -30,6 +31,7 @@ from ..utils import ( unescapeHTML, unified_strdate, orderedSet, + write_json_file, ) class YoutubeBaseInfoExtractor(InfoExtractor): @@ -433,18 +435,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Read from filesystem cache func_id = '%s_%s_%d' % (player_type, player_id, slen) assert os.path.basename(func_id) == func_id - cache_dir = self.downloader.params.get('cachedir', - u'~/.youtube-dl/cache') + cache_dir = self._downloader.params.get('cachedir', + u'~/.youtube-dl/cache') - if cache_dir is not False: + if cache_dir != u'NONE': cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', func_id + '.json') try: - with io.open(cache_fn, '', encoding='utf-8') as cachef: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: cache_spec = json.load(cachef) return lambda s: u''.join(s[i] for i in cache_spec) - except OSError: + except IOError: pass # No cache available if player_type == 'js': @@ -464,13 +466,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): assert False, 'Invalid player type %r' % player_type if cache_dir is not False: - cache_res = res(map(compat_chr, range(slen))) - cache_spec = [ord(c) for c in cache_res] - shutil.makedirs(os.path.dirname(cache_fn)) - write_json_file(cache_spec, cache_fn) + try: + cache_res = res(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + try: + os.makedirs(os.path.dirname(cache_fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(cache_spec, cache_fn) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Writing cache to %r failed: %s' % (cache_fn, tb)) return res + def _print_sig_code(self, func, slen): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = u'' if start == 0 else str(start) + ends = u':%d' 
% (end+step) + steps = u'' if step == 1 else (':%d' % step) + return u's[%s%s%s]' % (starts, ends, steps) + + step = None + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield u's[%d]' % prev + if step is None: + yield u's[%d]' % i + else: + yield _genslice(start, i, step) + + cache_res = func(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + expr_code = u' + '.join(gen_sig_code(cache_spec)) + code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) + self.to_screen(u'Extracted signature:\n' + code) + def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, @@ -1007,7 +1051,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id, player_url, len(s) ) self._player_cache[player_url] = func - return self._player_cache[player_url](s) + func = self._player_cache[player_url] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) except Exception as e: tb = traceback.format_exc() self._downloader.report_warning( -- cgit v1.2.3 From 4ba146f35dd797e9d78636cb3cffabb100575240 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:31:25 +0200 Subject: Update static signatures --- youtube_dl/extractor/youtube.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4200f987e..8245349b2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,8 +1072,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if len(s) == 86: return s[2:63] + s[82] + s[64:82] + s[63] - if len(s) == 92: + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] 
+ elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: -- cgit v1.2.3 From 0ca96d48c7f74e122be70b71bb5fe38f4b143cb0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:37:23 +0200 Subject: [youtube] Improve source code quality --- youtube_dl/extractor/youtube.py | 104 ++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 51 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8245349b2..a9bfc455f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,16 +2,16 @@ import collections import errno -import itertools import io +import itertools import json -import operator import os.path import re import socket import string import struct import traceback +import xml.etree.ElementTree import zlib from .common import InfoExtractor, SearchInfoExtractor @@ -475,7 +475,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if ose.errno != errno.EEXIST: raise write_json_file(cache_spec, cache_fn) - except Exception as e: + except Exception: tb = traceback.format_exc() self._downloader.report_warning( u'Writing cache to %r failed: %s' % (cache_fn, tb)) @@ -491,6 +491,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return u's[%s%s%s]' % (starts, ends, steps) step = None + start = '(Never used)' # Quelch pyflakes warnings - start will be + # set as soon as step is set for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: @@ -527,7 +529,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def interpret_statement(stmt, local_vars, allow_recursion=20): if allow_recursion < 0: - raise ExctractorError(u'Recursion 
limit reached') + raise ExtractorError(u'Recursion limit reached') if stmt.startswith(u'var '): stmt = stmt[len(u'var '):] @@ -685,7 +687,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): v = - ((v ^ 0xffffffff) + 1) return v - def string(reader=None): + def read_string(reader=None): if reader is None: reader = code_reader slen = u30(reader) @@ -706,31 +708,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return res # minor_version + major_version - _ = read_bytes(2 + 2) + read_bytes(2 + 2) # Constant pool int_count = u30() for _c in range(1, int_count): - _ = s32() + s32() uint_count = u30() for _c in range(1, uint_count): - _ = u32() + u32() double_count = u30() - _ = read_bytes((double_count-1) * 8) + read_bytes((double_count-1) * 8) string_count = u30() constant_strings = [u''] for _c in range(1, string_count): - s = string() + s = read_string() constant_strings.append(s) namespace_count = u30() for _c in range(1, namespace_count): - _ = read_bytes(1) # kind - _ = u30() # name + read_bytes(1) # kind + u30() # name ns_set_count = u30() for _c in range(1, ns_set_count): count = u30() for _c2 in range(count): - _ = u30() + u30() multiname_count = u30() MULTINAME_SIZES = { 0x07: 2, # QName @@ -749,13 +751,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): kind = u30() assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind if kind == 0x07: - namespace_idx = u30() + u30() # namespace_idx name_idx = u30() multinames.append(constant_strings[name_idx]) else: multinames.append('[MULTINAME kind: %d]' % kind) for _c2 in range(MULTINAME_SIZES[kind]): - _ = u30() + u30() # Methods method_count = u30() @@ -765,32 +767,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): method_infos = [] for method_id in range(method_count): param_count = u30() - _ = u30() # return type + u30() # return type for _ in range(param_count): - _ = u30() # param type - _ = u30() # name index (always 
0 for youtube) + u30() # param type + u30() # name index (always 0 for youtube) flags = read_byte() if flags & 0x08 != 0: # Options present option_count = u30() for c in range(option_count): - _ = u30() # val - _ = read_bytes(1) # kind + u30() # val + read_bytes(1) # kind if flags & 0x80 != 0: # Param names present for _ in range(param_count): - _ = u30() # param name + u30() # param name mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) method_infos.append(mi) # Metadata metadata_count = u30() for _c in range(metadata_count): - _ = u30() # name + u30() # name item_count = u30() for _c2 in range(item_count): - _ = u30() # key - _ = u30() # value + u30() # key + u30() # value def parse_traits_info(): trait_name_idx = u30() @@ -799,20 +801,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): attrs = kind_full >> 4 methods = {} if kind in [0x00, 0x06]: # Slot or Const - _ = u30() # Slot id - type_name_idx = u30() + u30() # Slot id + u30() # type_name_idx vindex = u30() if vindex != 0: - _ = read_byte() # vkind + read_byte() # vkind elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - _ = u30() # disp_id + u30() # disp_id method_idx = u30() methods[multinames[trait_name_idx]] = method_idx elif kind == 0x04: # Class - _ = u30() # slot_id - _ = u30() # classi + u30() # slot_id + u30() # classi elif kind == 0x05: # Function - _ = u30() # slot_id + u30() # slot_id function_idx = u30() methods[function_idx] = multinames[trait_name_idx] else: @@ -821,7 +823,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if attrs & 0x4 != 0: # Metadata present metadata_count = u30() for _c3 in range(metadata_count): - _ = u30() + u30() # metadata index return methods @@ -835,17 +837,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if name_idx == searched_idx: # We found the class we're looking for! 
searched_class_id = class_id - _ = u30() # super_name idx + u30() # super_name idx flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present - protected_ns_idx = u30() + u30() # protected_ns_idx intrf_count = u30() for _c2 in range(intrf_count): - _ = u30() - _ = u30() # iinit + u30() + u30() # iinit trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() if searched_class_id is None: raise ExtractorError(u'Target class %r not found' % @@ -854,7 +856,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): method_names = {} method_idxs = {} for class_id in range(class_count): - _ = u30() # cinit + u30() # cinit trait_count = u30() for _c2 in range(trait_count): trait_methods = parse_traits_info() @@ -867,10 +869,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Scripts script_count = u30() for _c in range(script_count): - _ = u30() # init + u30() # init trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() # Method bodies method_body_count = u30() @@ -878,10 +880,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): methods = {} for _c in range(method_body_count): method_idx = u30() - max_stack = u30() + u30() # max_stack local_count = u30() - init_scope_depth = u30() - max_scope_depth = u30() + u30() # init_scope_depth + u30() # max_scope_depth code_length = u30() code = read_bytes(code_length) if method_idx in method_idxs: @@ -889,14 +891,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): methods[method_idxs[method_idx]] = m exception_count = u30() for _c2 in range(exception_count): - _ = u30() # from - _ = u30() # to - _ = u30() # target - _ = u30() # exc_type - _ = u30() # var_name + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() assert p + 
code_reader.tell() == len(code_tag) assert len(methods) == len(method_idxs) @@ -1011,7 +1013,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): assert isinstance(obj, list) stack.append(obj[idx]) elif opcode == 128: # coerce - _ = u30(coder) + u30(coder) elif opcode == 133: # coerce_s assert isinstance(stack[-1], (type(None), compat_str)) elif opcode == 164: # modulo @@ -1055,7 +1057,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if self._downloader.params.get('youtube_print_sig_code'): self._print_sig_code(func, len(s)) return func(s) - except Exception as e: + except Exception: tb = traceback.format_exc() self._downloader.report_warning( u'Automatic signature extraction failed: ' + tb) -- cgit v1.2.3 From f8061589e66f12f6c2ffac3d7bfba2a7ac0294d5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:50:12 +0200 Subject: [youtube] Actually pass in cachedir option --- youtube_dl/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a9bfc455f..2dd2db673 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -438,7 +438,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): cache_dir = self._downloader.params.get('cachedir', u'~/.youtube-dl/cache') - if cache_dir != u'NONE': + cache_enabled = cache_dir != u'NONE' + if cache_enabled: cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', func_id + '.json') @@ -465,7 +466,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: assert False, 'Invalid player type %r' % player_type - if cache_dir is not False: + if cache_enabled: try: cache_res = res(map(compat_chr, range(slen))) cache_spec = [ord(c) for c in cache_res] @@ -515,7 +516,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): cache_spec = 
[ord(c) for c in cache_res] expr_code = u' + '.join(gen_sig_code(cache_spec)) code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) - self.to_screen(u'Extracted signature:\n' + code) + self.to_screen(u'Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( -- cgit v1.2.3 From 13dc64ce741520ba54ba9fff0ab1a3ac4e5c43a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 11:17:21 +0200 Subject: [youtube] Remove _decrypt_signature_age_gate --- youtube_dl/extractor/youtube.py | 9 --------- 1 file changed, 9 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2dd2db673..56ad33fdc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1109,15 +1109,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _decrypt_signature_age_gate(self, s): - # The videos with age protection use another player, so the algorithms - # can be different. 
- if len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] - else: - # Fallback to the other algortihms - return self._decrypt_signature(s) - def _get_available_subtitles(self, video_id): try: sub_list = self._download_webpage( -- cgit v1.2.3 From bdde940e90320e350bd96df621ee7e32641e1eca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:17:42 +0200 Subject: [youtube] Improve flash player URL handling --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 56ad33fdc..888907c93 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1437,10 +1437,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): if age_gate: - player_version = self._search_regex( - r'-(.+)\.swf$', - player_url if player_url else None, - 'flash player', fatal=False) + if player_url is None: + player_version = 'unknown' + else: + player_version = self._search_regex( + r'-(.+)\.swf$', player_url, + u'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( -- cgit v1.2.3 From d2d8f895310be7fa302ba7755c60d5948866fcaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:18:10 +0200 Subject: Do not warn if fallback is without alternatives (because we did not get the flash player URL) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 888907c93..780690ed0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1063,8 +1063,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 
self._downloader.report_warning( u'Automatic signature extraction failed: ' + tb) - self._downloader.report_warning( - u'Warning: Falling back to static signature algorithm') + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') return self._static_decrypt_signature( s, video_id, player_url, age_gate) -- cgit v1.2.3 From c705320f485cd962827fce464a93993569e3173f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:18:16 +0200 Subject: Correct test strings --- youtube_dl/extractor/youtube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 780690ed0..049da2f91 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -468,7 +468,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if cache_enabled: try: - cache_res = res(map(compat_chr, range(slen))) + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] try: os.makedirs(os.path.dirname(cache_fn)) @@ -512,7 +513,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: yield _genslice(start, i, step) - cache_res = func(map(compat_chr, range(slen))) + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = u' + '.join(gen_sig_code(cache_spec)) code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) -- cgit v1.2.3 From 81ec7c7901ddfe9366cf1af010eb31b906dcfce0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 23 Sep 2013 11:24:10 +0200 Subject: [facebook] Allow untitled videos (Fixes #1484) --- youtube_dl/extractor/facebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/facebook.py 
b/youtube_dl/extractor/facebook.py index beaa5b4bd..9d1bc0751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -106,8 +106,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._html_search_regex('

([^<]+)

', - webpage, u'title') + video_title = self._html_search_regex( + r'

([^<]*)

', webpage, u'title') info = { 'id': video_id, -- cgit v1.2.3 From a825f33030f189a37b1c3517ed1770a8b9e274fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 23 Sep 2013 21:28:33 +0200 Subject: [francetv] Add an extractor for France2 --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/francetv.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 949f59a44..65aacebb3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -42,6 +42,7 @@ from .flickr import FlickrIE from .francetv import ( PluzzIE, FranceTvInfoIE, + France2IE, ) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index b8fe82e47..5e915bc03 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -65,3 +65,25 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, page_title) video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id') return self._extract_video(video_id) + + +class France2IE(FranceTVBaseInfoExtractor): + IE_NAME = u'france2.fr' + _VALID_URL = r'https?://www\.france2\.fr/emissions/.*?/videos/(?P\d+)' + + _TEST = { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + return self._extract_video(video_id) -- cgit v1.2.3 From 5b333c1ce6287badd89dacdd280a3876a09dcbcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 
23 Sep 2013 21:41:54 +0200 Subject: [francetv] Add an extractor for Generation Quoi (closes #1475) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/francetv.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 65aacebb3..d1b7e5f99 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -43,6 +43,7 @@ from .francetv import ( PluzzIE, FranceTvInfoIE, France2IE, + GenerationQuoiIE ) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 5e915bc03..b1530e549 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,7 @@ # encoding: utf-8 import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( @@ -87,3 +88,30 @@ class France2IE(FranceTVBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') return self._extract_video(video_id) + + +class GenerationQuoiIE(InfoExtractor): + IE_NAME = u'http://generation-quoi.france2.fr' + _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P.*)(\?|$)' + + _TEST = { + u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', + u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', + u'info_dict': { + u'title': u'Génération Quoi - Garde à Vous', + u'uploader': u'Génération Quoi', + }, + u'params': { + # It uses Dailymotion + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name) + info_json = self._download_webpage(info_url, name) + info = json.loads(info_json) + return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], + ie='Dailymotion') -- cgit v1.2.3 From 
6f56389b8836301fc64f849e43ebd05043c0a66d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 24 Sep 2013 21:02:00 +0200 Subject: [youtube] update algos for length 86 and 84 (fixes #1494) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 47d5cb7ff..ec1cf8d30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -431,11 +431,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[:16][::-1] elif len(s) == 85: return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: - return s[81:36:-1] + s[0] + s[35:2:-1] + return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: -- cgit v1.2.3 From bb0eee71e7b7519321694f3d68875bbd71affeb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 24 Sep 2013 21:04:13 +0200 Subject: [youtube] Update one of the test's description --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec1cf8d30..606ed21c9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -352,7 +352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": 
u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", + u"description": u"md5:bdac09887d209a4ed54b8f76b2bdaa8b", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } -- cgit v1.2.3 From c3c88a2664595fd62898e44f8fc93c84e6d3c5a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:04:43 +0200 Subject: Allow opts.cachedir == None to disable cache --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 049da2f91..a6eefdf4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -438,7 +438,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): cache_dir = self._downloader.params.get('cachedir', u'~/.youtube-dl/cache') - cache_enabled = cache_dir != u'NONE' + cache_enabled = cache_dir is not None if cache_enabled: cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', -- cgit v1.2.3 From e35e4ddc9a4605a63a06c5bb12055bfceacb50b8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:18:03 +0200 Subject: Fix output of --youtube-print-sig-code when counting down to 0 --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a6eefdf4e..148b20160 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -488,8 +488,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def gen_sig_code(idxs): def _genslice(start, end, step): starts = u'' if start == 0 else str(start) - ends = u':%d' % (end+step) - steps = u'' if step == 1 else (':%d' % step) + ends = (u':%d' % (end+step)) if end + step >= 0 else u':' + steps = u'' if step == 1 else (u':%d' % 
step) return u's[%s%s%s]' % (starts, ends, steps) step = None -- cgit v1.2.3 From f2c327fd39d10115573d709f94f20721a80895fb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:20:42 +0200 Subject: Fix 86 signature (#1494) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 148b20160..e883a2c54 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1092,7 +1092,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] elif len(s) == 85: return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: -- cgit v1.2.3 From e80d8610645232583b5aec93fcd446fa67152d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 24 Sep 2013 21:38:37 +0200 Subject: Revert "[southparkstudios] Fix mgid extraction" This reverts commit 0fd49457f5257dbe317c69314ee57a6c485d41a3. It seems that the redesign was temporary. 
--- youtube_dl/extractor/southparkstudios.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index 1a611d3bb..b1e96b679 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -14,7 +14,7 @@ class SouthParkStudiosIE(MTVIE): u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', u'info_dict': { u'title': u'Bat Daded', - u'description': u'Randy finally gets the chance to fight Bat Dad and gets the boys disqualified from the season championships.', + u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', }, } @@ -33,6 +33,6 @@ class SouthParkStudiosIE(MTVIE): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - mgid = self._search_regex(r'data-mgid="(mgid:.*?)"', + mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', webpage, u'mgid') return self._get_videos_info(mgid) -- cgit v1.2.3 From 592882aa9f889432b07ad487f1a4228c9ae12818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 26 Sep 2013 13:53:57 +0200 Subject: [brightcove] Support videos that only provide flv versions (fixes #1504) Moved the test from generic.py to brightcove.py --- youtube_dl/extractor/brightcove.py | 62 ++++++++++++++++++++++++++++++-------- youtube_dl/extractor/generic.py | 11 ------- 2 files changed, 49 insertions(+), 24 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 71e3c7883..859baae75 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import re import json import xml.etree.ElementTree @@ -7,15 +9,37 @@ from ..utils import ( compat_urllib_parse, find_xpath_attr, compat_urlparse, + + 
ExtractorError, ) class BrightcoveIE(InfoExtractor): _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' - - # There is a test for Brigtcove in GenericIE, that way we test both the download - # and the detection of videos, and we don't have to find an URL that is always valid + + _TESTS = [ + { + u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', + u'file': u'2371591881001.mp4', + u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'note': u'Test Brightcove downloads and detection in GenericIE', + u'info_dict': { + u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + u'uploader': u'8TV', + u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', + } + }, + { + u'url': u'http://medianetwork.oracle.com/video/player/1785452137001', + u'file': u'1785452137001.flv', + u'info_dict': { + u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', + u'uploader': u'Oracle', + }, + }, + ] @classmethod def _build_brighcove_url(cls, object_str): @@ -72,15 +96,27 @@ class BrightcoveIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): - renditions = video_info['renditions'] - renditions = sorted(renditions, key=lambda r: r['size']) - best_format = renditions[-1] + info = { + 'id': video_info['id'], + 'title': video_info['displayName'], + 'description': video_info.get('shortDescription'), + 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), + 'uploader': video_info.get('publisherName'), + } - return {'id': video_info['id'], - 'title': video_info['displayName'], - 
'url': best_format['defaultURL'], + renditions = video_info.get('renditions') + if renditions: + renditions = sorted(renditions, key=lambda r: r['size']) + best_format = renditions[-1] + info.update({ + 'url': best_format['defaultURL'], 'ext': 'mp4', - 'description': video_info.get('shortDescription'), - 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), - 'uploader': video_info.get('publisherName'), - } + }) + elif video_info.get('FLVFullLengthURL') is not None: + info.update({ + 'url': video_info['FLVFullLengthURL'], + 'ext': 'flv', + }) + else: + raise ExtractorError(u'Unable to extract video url for %s' % info['id']) + return info diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f92e61fea..764070635 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,17 +29,6 @@ class GenericIE(InfoExtractor): u"title": u"R\u00e9gis plante sa Jeep" } }, - { - u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', - u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', - u'note': u'Test Brightcove downloads and detection in GenericIE', - u'info_dict': { - u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', - u'uploader': u'8TV', - u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', - } - }, ] def report_download_webpage(self, video_id): -- cgit v1.2.3 From 4de1994b6ed61a2aaddeee6452959d645fe5954b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 26 Sep 2013 18:59:56 +0200 Subject: [brightcove] Use direct url for the tests The test_all_urls.py test failed because BrightcoveIE doesn't match them. 
--- youtube_dl/extractor/brightcove.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 859baae75..558b3d009 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -20,7 +20,8 @@ class BrightcoveIE(InfoExtractor): _TESTS = [ { - u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', + # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', u'md5': u'9e80619e0a94663f0bdc849b4566af19', u'note': u'Test Brightcove downloads and detection in GenericIE', @@ -31,7 +32,8 @@ class BrightcoveIE(InfoExtractor): } }, { - u'url': u'http://medianetwork.oracle.com/video/player/1785452137001', + # From http://medianetwork.oracle.com/video/player/1785452137001 + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', u'file': u'1785452137001.flv', u'info_dict': { u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', -- cgit v1.2.3 From ce65fb6c76e4496a35cd597bbc735e0351d82853 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 27 Sep 2013 05:50:16 +0200 Subject: [RTLnowIE] Add support for http://rtlnitronow.de --- youtube_dl/extractor/rtlnow.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 7bb236c2b..963e0cc8f 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import ( ) class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" - 
_VALID_URL = r'(?:http://)?(?P(?Prtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW and VOX NOW""" + _VALID_URL = r'(?:http://)?(?P(?Prtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -61,6 +61,19 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://www.rtlnitronow.de/recht-ordnung/fahrradpolizei-koeln-fischereiaufsicht-ruegen.php?film_id=124311&player=1&season=1', + u'file': u'124311.flv', + u'info_dict': { + u'upload_date': u'20130830', + u'title': u'Recht & Ordnung - Fahrradpolizei Köln & Fischereiaufsicht Rügen', + u'description': u'Fahrradpolizei Köln & Fischereiaufsicht Rügen', + u'thumbnail': u'http://autoimg.static-fra.de/nitronow/338273/1500x1500/image2.jpg' + }, + u'params': { + u'skip_download': True, + }, }] def _real_extract(self,url): -- cgit v1.2.3 From 63efc427cd4a2e0892e02e0519134d760b30814a Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 27 Sep 2013 06:00:37 +0200 Subject: [RTLnowIE] Clean video title The title of some videos has the following format: Series - Episode | Series online schauen bei ... 
NOW --- youtube_dl/extractor/rtlnow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 7bb236c2b..3783aa538 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -79,7 +79,7 @@ class RTLnowIE(InfoExtractor): msg = clean_html(note_m.group(1)) raise ExtractorError(msg) - video_title = self._html_search_regex(r'(?P<title>[^<]+)', + video_title = self._html_search_regex(r'(?P<title>[^<]+?)( \| [^<]*)?', webpage, u'title') playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', webpage, u'playerdata_url') -- cgit v1.2.3 From 920de7a27d11a8f162e108c5891de70db738693a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 27 Sep 2013 06:15:21 +0200 Subject: [youtube] Fix 83 signature (Closes #1511) --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6beda8f3b..89c41efe5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1067,6 +1067,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self._downloader.report_warning( u'Warning: Falling back to static signature algorithm') + return self._static_decrypt_signature( s, video_id, player_url, age_gate) @@ -1098,7 +1099,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 84: return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] elif len(s) == 83: - return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] + return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] elif len(s) == 82: return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54] elif len(s) == 81: -- cgit v1.2.3 From 
2dc592991aac5e0b3b91e3d2123490184033177e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 27 Sep 2013 14:20:52 +0200 Subject: [youtube] update description of test --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 89c41efe5..9aee2ebf2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -361,7 +361,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:bdac09887d209a4ed54b8f76b2bdaa8b", + u"description": u"md5:5b292926389560516e384ac437c0ec07", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } -- cgit v1.2.3 From f490e77e77c9db082e073f002088d021b16513ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 27 Sep 2013 14:22:36 +0200 Subject: [youtube] Set the thumbnail to None if it can't be extracted --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9aee2ebf2..618d87515 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1360,7 +1360,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: self._downloader.report_warning(u'unable to extract video thumbnail') - video_thumbnail = '' + video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) -- cgit v1.2.3 From 9abb32045a85e1ecc831c624494ad41af3997e20 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 27 Sep 2013 15:06:27 +0200 Subject: [youtube] Add hlsvp to the error message if it can't be found and remove the live stream test It's no longer available, other olympics streams have the same problem. --- youtube_dl/extractor/youtube.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 618d87515..53f13b516 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -378,21 +378,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"justintimberlakeVEVO" } }, - { - u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE', - u'file': u'TGi3HqYrWHE.mp4', - u'note': u'm3u8 video', - u'info_dict': { - u'title': u'Triathlon - Men - London 2012 Olympic Games', - u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games', - u'uploader': u'olympic', - u'upload_date': u'20120807', - u'uploader_id': u'olympic', - }, - u'params': { - u'skip_download': True, - }, - }, ] @@ -1480,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return else: - raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') + raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') results = [] for format_param, video_real_url in video_url_list: -- cgit v1.2.3 From 0b7c2485b66d53ad14bc331e867927b370599e43 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 28 Sep 2013 15:43:34 +0200 Subject: [zdf] Add support for hash URLs and simplify (#1518) --- youtube_dl/extractor/zdf.py | 74 +++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 32 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 
418509cb9..faed7ff7f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -2,16 +2,14 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, - unescapeHTML, ) + class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P[^/\?]+)(?:\?.*)?' - _TITLE = r'(?P.*)</h1>' + _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' - _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -19,6 +17,9 @@ class ZDFIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') + if mobj.group('hash'): + url = url.replace(u'#', u'', 1) + html = self._download_webpage(url, video_id) streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] if streams is None: @@ -27,39 +28,48 @@ class ZDFIE(InfoExtractor): # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url # choose first/default media type and highest quality for now - for s in streams: #find 300 - dsl1000mbit - if s['quality'] == '300' and s['media_type'] == 'wstreaming': - stream_=s - break - for s in streams: #find veryhigh - dsl2000mbit - if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working - stream_=s - break - if stream_ is None: + def stream_pref(s): + TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + try: + type_pref = TYPE_ORDER.index(s['media_type']) + except ValueError: + type_pref = 999 + + QUALITY_ORDER = ['veryhigh', '300'] + try: + quality_pref = QUALITY_ORDER.index(s['quality']) + except ValueError: + 
quality_pref = 999 + + return (type_pref, quality_pref) + + sorted_streams = sorted(streams, key=stream_pref) + if not sorted_streams: raise ExtractorError(u'No stream found.') + stream = sorted_streams[0] - media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + media_link = self._download_webpage( + stream['video_url'], + video_id, + u'Get stream URL') - self.report_extraction(video_id) - mobj = re.search(self._TITLE, html) - if mobj is None: - raise ExtractorError(u'Cannot extract title') - title = unescapeHTML(mobj.group('title')) + MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' - mobj = re.search(self._MMS_STREAM, media_link) + mobj = re.search(self._MEDIA_STREAM, media_link) if mobj is None: - mobj = re.search(self._RTSP_STREAM, media_link) + mobj = re.search(RTSP_STREAM, media_link) if mobj is None: raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - mms_url = mobj.group('video_url') + video_url = mobj.group('video_url') - mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) - if mobj is None: - raise ExtractorError(u'Cannot extract extention') - ext = mobj.group('ext') + title = self._html_search_regex( + r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', + html, u'title') - return [{'id': video_id, - 'url': mms_url, - 'title': title, - 'ext': ext - }] + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url) + } -- cgit v1.2.3 From 9c15e9de849641143e7654f2656c68e066fe9e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 28 Sep 2013 21:19:52 +0200 Subject: [yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 
--- youtube_dl/extractor/yahoo.py | 132 +++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 67 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 32d5b9477..39126e631 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,4 +1,3 @@ -import datetime import itertools import json import re @@ -6,86 +5,85 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_urllib_parse, - - ExtractorError, + compat_urlparse, + determine_ext, + clean_html, ) + class YahooIE(InfoExtractor): IE_DESC = u'Yahoo screen' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' - _TEST = { - u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - u'file': u'214727115.flv', - u'md5': u'2e717f169c1be93d84d3794a00d4a325', - u'info_dict': { - u"title": u"Julian Smith & Travis Legg Watch Julian Smith" + _TESTS = [ + { + u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + u'file': u'214727115.mp4', + u'info_dict': { + u'title': u'Julian Smith & Travis Legg Watch Julian Smith', + u'description': u'Julian and Travis watch Julian Smith', + }, }, - u'skip': u'Requires rtmpdump' - } + { + u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', + u'file': u'103000935.flv', + u'info_dict': { + u'title': u'The Cougar Lies with Spanish Moss', + u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? 
And if your waffles have nothing to hide, what are they so worried about?', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage) - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - Date: Sun, 29 Sep 2013 12:44:02 +0200 Subject: [dailymotion] Disable the family filter in the playlists (fixes #1524) --- youtube_dl/extractor/dailymotion.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 64b89aae8..3f012aedc 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -14,8 +14,15 @@ from ..utils import ( ExtractorError, ) +class DailymotionBaseInfoExtractor(InfoExtractor): + @staticmethod + def _build_request(url): + """Build a request with the family filter disabled""" + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'family_filter=off') + return request -class DailymotionIE(SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' @@ -40,8 +47,7 @@ class 
DailymotionIE(SubtitlesInfoExtractor): url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') + request = self._build_request(url) webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage @@ -113,7 +119,7 @@ class DailymotionIE(SubtitlesInfoExtractor): return {} -class DailymotionPlaylistIE(InfoExtractor): +class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = u'dailymotion:playlist' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' _MORE_PAGES_INDICATOR = r'
' @@ -122,7 +128,8 @@ class DailymotionPlaylistIE(InfoExtractor): def _extract_entries(self, id): video_ids = [] for pagenum in itertools.count(1): - webpage = self._download_webpage(self._PAGE_TEMPLATE % (id, pagenum), + request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) + webpage = self._download_webpage(request, id, u'Downloading page %s' % pagenum) playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) -- cgit v1.2.3 From 843530568f326294d714b5b9f11bbf6176d73ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 29 Sep 2013 20:49:58 +0200 Subject: [appletrailers] Rework extraction (fixes #1387) The extraction was broken: * The includes page contains img elements that need to be fixed. * Use the 'itunes.inc' page, it contains a json dictionary for each trailer with information. * Get the formats from 'includes/settings{trailer_name}.json' * Use urljoin to allow urls with a fragment identifier to work Removed the thumbnail urls from the tests, they are different now. 
--- youtube_dl/extractor/appletrailers.py | 112 +++++++++++++--------------------- 1 file changed, 42 insertions(+), 70 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c196..b86c4b909 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,8 +1,10 @@ import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( + compat_urlparse, determine_ext, ) @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor): u"playlist": [ { u"file": u"manofsteel-trailer4.mov", - u"md5": u"11874af099d480cc09e103b189805d5f", + u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8", u"info_dict": { u"duration": 111, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", u"title": u"Trailer 4", u"upload_date": u"20130523", u"uploader_id": u"wb", @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer3.mov", - u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"md5": u"b8017b7131b721fb4e8d6f49e1df908c", u"info_dict": { u"duration": 182, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", u"title": u"Trailer 3", u"upload_date": u"20130417", u"uploader_id": u"wb", @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer.mov", - u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"md5": u"d0f1e1150989b9924679b441f3404d48", u"info_dict": { u"duration": 148, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", u"title": u"Trailer", u"upload_date": u"20121212", u"uploader_id": u"wb", @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-teaser.mov", - u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"md5": u"5fe08795b943eb2e757fa95cb6def1cb", u"info_dict": { u"duration": 93, - u"thumbnail": 
u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", u"title": u"Teaser", u"upload_date": u"20120721", u"uploader_id": u"wb", @@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor): ] } + _JSON_RE = r'iTunes.playURL\((.*?)\);' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) + playlist_cleaned = re.sub(r'(?s).*?', u'', playlist_snippet) + playlist_cleaned = re.sub(r'', r'', playlist_cleaned) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # with xml.etree.ElementTree.fromstring + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) playlist_html = u'' + playlist_cleaned + u'' - size_cache = {} - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): - title = li.find('.//h3').text + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, u'trailer info') + trailer_info = json.loads(trailer_info_json) + title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') - date_el = li.find('.//p') - upload_date = None - m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) - if m: - upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') - runtime_el = date_el.find('./br') - m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', 
runtime_el.tail) + runtime = trailer_info['runtime'] + m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) duration = None if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - formats = [] - for formats_el in li.findall('.//a'): - if formats_el.attrib['class'] != 'OverlayPanel': - continue - target = formats_el.attrib['target'] - - format_code = formats_el.text - if 'Automatic' in format_code: - continue + first_url = trailer_info['url'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0] + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') + settings = json.loads(settings_json) - size_q = formats_el.attrib['href'] - size_id = size_q.rpartition('#videos-')[2] - if size_id not in size_cache: - size_url = url + size_q - sizepage_html = self._download_webpage( - size_url, movie, - note=u'Downloading size info %s' % size_id, - errnote=u'Error while downloading size info %s' % size_id, - ) - _doc = xml.etree.ElementTree.fromstring(sizepage_html) - size_cache[size_id] = _doc - - sizepage_doc = size_cache[size_id] - links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') - for vid_a in links: - href = vid_a.get('href') - if not href.endswith(target): - continue - detail_q = href.partition('#')[0] - detail_url = url + '/' + detail_q - - m = re.match(r'includes/(?P[^/]+)/', detail_q) - detail_id = m.group('detail_id') - - detail_html = self._download_webpage( - detail_url, movie, - note=u'Downloading detail %s %s' % (detail_id, size_id), - errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) - ) - detail_doc = xml.etree.ElementTree.fromstring(detail_html) - movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') - assert movie_link_el.get('class') == 'movieLink' - movie_link = 
movie_link_el.get('href').partition('?')[0].replace('_', '_h') - ext = determine_ext(movie_link) - assert ext == 'mov' - - formats.append({ - 'format': format_code, - 'ext': ext, - 'url': movie_link, - }) + formats = [] + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'ext': determine_ext(format_url), + 'format': format['type'], + 'width': format['width'], + 'height': int(format['height']), + }) + formats = sorted(formats, key=lambda f: (f['height'], f['width'])) info = { '_type': 'video', -- cgit v1.2.3 From bb4aa62cf7ad3d5aae4edf56ab8954c80a2d8956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 29 Sep 2013 20:59:19 +0200 Subject: [appletrailers] The request for the settings must have the trailer name in lower case (fixes #1329) --- youtube_dl/extractor/appletrailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index b86c4b909..6d6237f8a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -95,7 +95,7 @@ class AppleTrailersIE(InfoExtractor): duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) first_url = trailer_info['url'] - trailer_id = first_url.split('/')[-1].rpartition('_')[0] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') settings = json.loads(settings_json) -- cgit v1.2.3 From 722076a123c60ed6d5a978c4bc2609f46c8e3ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 29 Sep 2013 23:07:26 +0200 Subject: [rtlnow] 
Replace one of the tests The video is no longer available. --- youtube_dl/extractor/rtlnow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index e6fa0475e..32541077f 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -63,13 +63,13 @@ class RTLnowIE(InfoExtractor): }, }, { - u'url': u'http://www.rtlnitronow.de/recht-ordnung/fahrradpolizei-koeln-fischereiaufsicht-ruegen.php?film_id=124311&player=1&season=1', - u'file': u'124311.flv', + u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', + u'file': u'127367.flv', u'info_dict': { - u'upload_date': u'20130830', - u'title': u'Recht & Ordnung - Fahrradpolizei Köln & Fischereiaufsicht Rügen', - u'description': u'Fahrradpolizei Köln & Fischereiaufsicht Rügen', - u'thumbnail': u'http://autoimg.static-fra.de/nitronow/338273/1500x1500/image2.jpg' + u'upload_date': u'20130926', + u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', + u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', + u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', }, u'params': { u'skip_download': True, -- cgit v1.2.3