diff options
Diffstat (limited to 'youtube_dl')
41 files changed, 1403 insertions, 580 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3dff723b8..686988fe5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1197,6 +1197,10 @@ class YoutubeDL(object): if res: res += ', ' res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) return res def list_formats(self, info_dict): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6e2359b28..c6a5b2b5b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -65,17 +65,16 @@ __authors__ = ( 'Tobias Bell', 'Naglis Jonaitis', 'Charles Chen', + 'Hassaan Ali', ) __license__ = 'Public Domain' import codecs import io -import locale import optparse import os import random -import re import shlex import sys @@ -634,7 +633,7 @@ def _real_main(argv=None): if desc is False: continue if hasattr(ie, 'SEARCH_KEY'): - _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise') + _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise', u'sleeping bunny') _COUNTS = (u'', u'5', u'10', u'all') desc += u' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) compat_print(desc) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c5961cab9..8d63d9281 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,5 +1,6 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE +from .adultswim import AdultSwimIE from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE from .aol import AolIE @@ -52,6 +53,7 @@ from .cnn import ( from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE +from .cracked import CrackedIE from .criterion import CriterionIE from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE @@ -62,6 +64,7 @@ from .dailymotion import ( DailymotionUserIE, ) from .daum import DaumIE +from .dfb import DFBIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .drtv import DRTVIE @@ -250,6 +253,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .sapo import SapoIE from .savefrom import SaveFromIE from .scivee import SciVeeIE from .screencast import ScreencastIE @@ -263,6 +267,8 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snotr import SnotrIE +from .sockshare import SockshareIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, @@ -397,6 +403,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) + from .zdf import ZDFIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py new file mode 100644 index 000000000..a00bfcb35 --- /dev/null +++ b/youtube_dl/extractor/adultswim.py @@ -0,0 +1,139 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class AdultSwimIE(InfoExtractor): + _VALID_URL = r'https?://video\.adultswim\.com/(?P<path>.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$' + _TEST = { + 'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title', + 'playlist': [ + { + 'md5': '4da359ec73b58df4575cd01a610ba5dc', + 'info_dict': { + 'id': '8a250ba1450996e901453d7f02ca02f5', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 1', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'ffbdf55af9331c509d95350bd0cc1819', + 'info_dict': { + 'id': '8a250ba1450996e901453d7f4bd102f6', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 2', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'b92409635540304280b4b6c36bd14a0a', + 'info_dict': { + 'id': '8a250ba1450996e901453d7fa73c02f7', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 3', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'e8818891d60e47b29cd89d7b0278156d', + 'info_dict': { + 'id': '8a250ba1450996e901453d7fc8ba02f8', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 4', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + } + ] + } + + _video_extensions = { + '3500': 'flv', + '640': 'mp4', + '150': 'mp4', + 'ipad': 'm3u8', + 'iphone': 'm3u8' + } + _video_dimensions = { + '3500': (1280, 720), + '640': (480, 270), + '150': (320, 180) + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_path = mobj.group('path') + + webpage = self._download_webpage(url, video_path) + episode_id = self._html_search_regex(r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>', webpage, 'episode_id') + title = self._og_search_title(webpage) + + index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id + idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index') + + episode_el = idoc.find('.//episode') + show_title = episode_el.attrib.get('collectionTitle') + episode_title = episode_el.attrib.get('title') + thumbnail = episode_el.attrib.get('thumbnailUrl') + description = episode_el.find('./description').text.strip() + + entries = [] + segment_els = episode_el.findall('./segments/segment') + + for part_num, segment_el in enumerate(segment_els): + segment_id = segment_el.attrib.get('id') + segment_title = '%s %s part %d' % (show_title, episode_title, part_num + 1) + thumbnail = segment_el.attrib.get('thumbnailUrl') + duration = segment_el.attrib.get('duration') + + segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id + idoc = self._download_xml(segment_url, segment_title, 'Downloading segment information', 'Unable to download segment information') + + formats = [] + file_els = idoc.findall('.//files/file') + + for file_el in file_els: + bitrate = file_el.attrib.get('bitrate') + type = file_el.attrib.get('type') + width, height = self._video_dimensions.get(bitrate, (None, None)) + formats.append({ + 'format_id': '%s-%s' % (bitrate, type), + 'url': file_el.text, + 'ext': self._video_extensions.get(bitrate, 'mp4'), + # The bitrate may not be a number (for example: 'iphone') + 'tbr': int(bitrate) if bitrate.isdigit() else None, + 'height': height, + 'width': width + }) + + self._sort_formats(formats) + + entries.append({ + 'id': segment_id, + 'title': segment_title, + 'formats': formats, + 'uploader': show_title, + 'thumbnail': thumbnail, + 'duration': duration, + 'description': description + }) + + return { + '_type': 'playlist', + 'id': episode_id, + 'display_id': video_path, + 'entries': entries, + 'title': '%s %s' % (show_title, episode_title), + 'description': description, + 'thumbnail': thumbnail + } diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 34f0cd49b..7bd797884 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -32,7 +32,7 @@ class AllocineIE(InfoExtractor): 'id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', - 'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d', + 'description': 'md5:eeaffe7c2d634525e21159b93acf3b1e', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -42,7 +42,7 @@ class AllocineIE(InfoExtractor): 'id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', - 'description': 'md5:e74a4dc750894bac300ece46c7036490', + 'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac', 'thumbnail': 're:http://.*\.jpg', }, }] diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index b36a4d46a..30a85c8c1 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -7,23 +7,32 @@ from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, + qualities, ) class ARDIE(InfoExtractor): - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' - _TEST = { - 'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786', - 'file': '19288786.mp4', - 'md5': '515bf47ce209fb3f5a61b7aad364634c', + _TESTS = [{ + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'file': '22429276.mp4', + 'md5': '469751912f1de0816a9fc9df8336476c', 'info_dict': { - 'title': 'Edward Snowden im Interview - Held oder Verräter?', - 'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.', - 'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037', + 'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?', + 'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014', }, 'skip': 'Blocked outside of Germany', - } + }, { + 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'info_dict': { + 'id': '22490580', + 'ext': 'mp4', + 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', + 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + }, + 'skip': 'Blocked outside of Germany', + }] def _real_extract(self, url): # determine video id from url @@ -43,40 +52,64 @@ class ARDIE(InfoExtractor): r'<h4 class="headline">(.*?)</h4>'], webpage, 'title') description = self._html_search_meta( - 'dcterms.abstract', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - - - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') - - formats = [] - - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) + 'dcterms.abstract', webpage, 'description', default=None) + if description is None: + description = self._html_search_meta( + 'description', webpage, 'meta description') + + # Thumbnail is sometimes not present. + # It is in the mobile version, but that seems to use a different URL + # structure altogether. + thumbnail = self._og_search_thumbnail(webpage, default=None) + + media_streams = re.findall(r'''(?x) + mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* + "([^"]+)"''', webpage) + + if media_streams: + QUALITIES = qualities(['lo', 'hi', 'hq']) + formats = [] + for furl in set(media_streams): + if furl.endswith('.f4m'): + fid = 'f4m' + else: + fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) + fid = fid_m.group(1) if fid_m else None + formats.append({ + 'quality': QUALITIES(fid), + 'format_id': fid, + 'url': furl, + }) + else: # request JSON file + media_info = self._download_json( + 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) + # The second element of the _mediaArray contains the standard http urls + streams = media_info['_mediaArray'][1]['_mediaStreamArray'] + if not streams: + if '"fsk"' in webpage: + raise ExtractorError('This video is only available after 20:00') + + formats = [] + for s in streams: + if type(s['_stream']) == list: + for index, url in enumerate(s['_stream'][::-1]): + quality = s['_quality'] + index + formats.append({ + 'quality': quality, + 'url': url, + 'format_id': '%s-%s' % (determine_ext(url), quality) }) - continue + continue - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } + format = { + 'quality': s['_quality'], + 'url': s['_stream'], + } - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) + format['format_id'] = '%s-%s' % ( + determine_ext(format['url']), format['quality']) - formats.append(format) + formats.append(format) self._sort_formats(formats) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 993360714..f7f2f713a 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -12,7 +12,7 @@ from ..utils import ( class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P<id>[a-z0-9\-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _BASE_URL = 'http://www.br.de' _TESTS = [ diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index ac0315853..822f9a7be 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,24 +1,42 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*' - _TEST = { - u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', - u'file': u'4JUVEwq3wUT7.flv', - u'info_dict': { - u'title': u'Connect Chat feat. Garth Brooks', - u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - u'duration': 1495, + _TESTS = [{ + 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'info_dict': { + 'id': '4JUVEwq3wUT7', + 'ext': 'flv', + 'title': 'Connect Chat feat. Garth Brooks', + 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + 'duration': 1495, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + '_skip': 'Blocked outside the US', + }, { + 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', + 'info_dict': { + 'id': 'P9gjWjelt6iP', + 'ext': 'flv', + 'title': 'Live on Letterman - St. Vincent', + 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', + 'duration': 3221, }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, - } + '_skip': 'Blocked outside the US', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -26,5 +44,5 @@ class CBSIE(InfoExtractor): webpage = self._download_webpage(url, video_id) real_id = self._search_regex( r"video\.settings\.pid\s*=\s*'([^']+)';", - webpage, u'real video ID') + webpage, 'real video ID') return self.url_result(u'theplatform:%s' % real_id) diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 02d5ba527..a62395d4b 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -42,7 +42,7 @@ class ChilloutzoneIE(InfoExtractor): 'id': '85523671', 'ext': 'mp4', 'title': 'The Sunday Times - Icons', - 'description': 'md5:3e1c0dc6047498d6728dcdaad0891762', + 'description': 'md5:a5f7ff82e2f7a9ed77473fe666954e84', 'uploader': 'Us', 'uploader_id': 'usfilms', 'upload_date': '20140131' diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index a94f42571..710d5009b 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -43,7 +43,11 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') video_id = vdata['id'] - title = vdata['headline'] + title = vdata.get('headline') + if title is None: + title = vdata.get('title') + if title is None: + raise ExtractorError('Cannot find title!') description = vdata.get('dek') thumbnail = vdata.get('image', {}).get('path') author = vdata.get('author') diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 8af0abade..c81ce5a96 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,13 +14,13 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ - (video-clips|episodes|cc-studios|video-collections) + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (video-clips|episodes|cc-studios|video-collections|full-episodes) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TEST = { - 'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e68657314..9b36e0789 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -69,6 +69,7 @@ class InfoExtractor(object): * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. @@ -468,7 +469,7 @@ class InfoExtractor(object): display_name = name return self._html_search_regex( r'''(?ix)<meta - (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) + (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=fatal, **kwargs) @@ -555,6 +556,7 @@ class InfoExtractor(object): f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, + f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py new file mode 100644 index 000000000..74b880ffc --- /dev/null +++ b/youtube_dl/extractor/cracked.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + str_to_int, +) + + +class CrackedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html' + _TEST = { + 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', + 'md5': '4b29a5eeec292cd5eca6388c7558db9e', + 'info_dict': { + 'id': '19006', + 'ext': 'mp4', + 'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies', + 'description': 'md5:3b909e752661db86007d10e5ec2df769', + 'timestamp': 1405659600, + 'upload_date': '20140718', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], webpage, 'video URL') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + timestamp = self._html_search_regex(r'<time datetime="([^"]+)"', webpage, 'upload date', fatal=False) + if timestamp: + timestamp = parse_iso8601(timestamp[:-6]) + + view_count = str_to_int(self._html_search_regex( + r'<span class="views" id="viewCounts">([\d,\.]+) Views</span>', webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'<span id="commentCounts">([\d,\.]+)</span>', webpage, 'comment count', fatal=False)) + + m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url) + if m: + width = int(m.group('width')) + height = int(m.group('height')) + else: + width = height = None + + return { + 'id': video_id, + 'url':video_url, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'height': height, + 'width': width, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py new file mode 100644 index 000000000..cb8e06822 --- /dev/null +++ b/youtube_dl/extractor/dfb.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DFBIE(InfoExtractor): + IE_NAME = 'tv.dfb.de' + _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)' + + _TEST = { + 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/', + # The md5 is different each time + 'info_dict': { + 'id': '9070', + 'ext': 'flv', + 'title': 'Highlights des Empfangs in Berlin', + 'upload_date': '20140716', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_info = self._download_xml( + 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, + video_id) + video_info = player_info.find('video') + + f4m_info = self._download_xml(video_info.find('url').text, video_id) + token_el = f4m_info.find('token') + manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' + + return { + 'id': video_id, + 'title': video_info.find('title').text, + 'url': manifest_url, + 'ext': 'flv', + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]), + } diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 41208c976..9f569aa93 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,24 +5,26 @@ import os.path import re from .common import InfoExtractor +from ..utils import compat_urllib_parse_unquote class DropboxIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' _TEST = { - 'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4', - 'md5': '8ae17c51172fb7f93bdd6a214cc8c896', + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4', + 'md5': '8a3d905427a6951ccb9eb292f154530b', 'info_dict': { - 'id': '0qr9sai2veej4f8', + 'id': 'nelirfsxnmcfbfh', 'ext': 'mp4', - 'title': 'THE_DOCTOR_GAMES' + 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - title = os.path.splitext(mobj.group('title'))[0] + fn = compat_urllib_parse_unquote(mobj.group('title')) + title = os.path.splitext(fn)[0] video_url = url + '?dl=1' return { diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py index d26145db1..6d73c8a4a 100644 --- a/youtube_dl/extractor/firedrive.py +++ b/youtube_dl/extractor/firedrive.py @@ -8,7 +8,6 @@ from ..utils import ( ExtractorError, compat_urllib_parse, compat_urllib_request, - determine_ext, ) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f3e0f38b7..1fbe6d175 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -48,7 +48,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P<title>.+)\.html' + _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -211,7 +211,7 @@ class GenerationQuoiIE(InfoExtractor): class CultureboxIE(FranceTVBaseInfoExtractor): IE_NAME = 'culturebox.francetvinfo.fr' - _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' + _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' _TEST = { 'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 6e6b66660..721e5fce0 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -26,7 +26,7 @@ class FunnyOrDieIE(InfoExtractor): 'id': 'e402820827', 'ext': 'mp4', 'title': 'Please Use This Song (Jon Lajoie)', - 'description': 'md5:2ed27d364f5a805a6dba199faaf6681d', + 'description': 'Please use this to sell something. www.jonlajoie.com', 'thumbnail': 're:^http:.*\.jpg$', }, }] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f97b59845..9db27f9aa 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -402,7 +402,7 @@ class GenericIE(InfoExtractor): elif default_search == 'error': raise ExtractorError( ('%r is not a valid URL. ' - 'Set --default-search "ytseach" (or run youtube-dl "ytsearch:%s" ) to search YouTube' + 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: assert ':' in default_search diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 961dd1aa6..56a76380c 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class KickStarterIE(InfoExtractor): _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*' - _TEST = { + _TESTS = [{ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location', 'md5': 'c81addca81327ffa66c642b5d8b08cab', 'info_dict': { @@ -18,22 +18,45 @@ class KickStarterIE(InfoExtractor): 'description': 'A unique motocross documentary that examines the ' 'life and mind of one of sports most elite athletes: Josh Grant.', }, - } + }, { + 'note': 'Embedded video (not using the native kickstarter video service)', + 'url': 'https://www.kickstarter.com/projects/597507018/pebble-e-paper-watch-for-iphone-and-android/posts/659178', + 'playlist': [ + { + 'info_dict': { + 'id': '78704821', + 'ext': 'mp4', + 'uploader_id': 'pebble', + 'uploader': 'Pebble Technology', + 'title': 'Pebble iOS Notifications', + } + } + ], + }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'data-video-url="(.*?)"', - webpage, 'video URL') - video_title = self._html_search_regex(r'<title>(.*?)</title>', - webpage, 'title').rpartition('— Kickstarter')[0].strip() + title = self._html_search_regex( + r'<title>\s*(.*?)(?:\s*— Kickstarter)?\s*</title>', + webpage, 'title') + video_url = self._search_regex( + r'data-video-url="(.*?)"', + webpage, 'video URL', default=None) + if video_url is None: # No native kickstarter, look for embedded videos + return { + '_type': 'url_transparent', + 'ie_key': 'Generic', + 'url': url, + 'title': title, + } return { 'id': video_id, 'url': video_url, - 'title': video_title, + 'title': title, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 2c100d424..1ea1bbab4 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -28,11 +28,13 @@ class LivestreamIE(InfoExtractor): } def _extract_video_info(self, video_data): - video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') + video_url = ( + video_data.get('progressive_url_hd') or + video_data.get('progressive_url') + ) return { 'id': compat_str(video_data['id']), 'url': video_url, - 'ext': 'mp4', 'title': video_data['caption'], 'thumbnail': video_data['thumbnail_url'], 'upload_date': video_data['updated_at'].replace('-', '')[:8], @@ -50,7 +52,8 @@ class LivestreamIE(InfoExtractor): r'window.config = ({.*?});', webpage, 'window config') info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] if video_data['type'] == 'video'] + for video_data in info['feed']['data'] + if video_data['type'] == 'video'] return self.playlist_result(videos, info['id'], info['full_name']) else: og_video = self._og_search_video_url(webpage, 'player url') diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index aa34665d1..70aa98aee 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -85,11 +85,25 @@ class NBCNewsIE(InfoExtractor): flags=re.MULTILINE) bootstrap = json.loads(bootstrap_json) info = bootstrap['results'][0]['video'] - playlist_url = info['fallbackPlaylistUrl'] + '?form=MPXNBCNewsAPI' mpxid = info['mpxId'] - all_videos = self._download_json(playlist_url, title)['videos'] - # The response contains additional videos - info = next(v for v in all_videos if v['mpxId'] == mpxid) + + base_urls = [ + info['fallbackPlaylistUrl'], + info['associatedPlaylistUrl'], + ] + + for base_url in base_urls: + playlist_url = base_url + '?form=MPXNBCNewsAPI' + all_videos = self._download_json(playlist_url, title)['videos'] + + try: + info = next(v for v in all_videos if v['mpxId'] == mpxid) + break + except StopIteration: + continue + + if info is None: + raise ExtractorError('Could not find video in playlists') return { '_type': 'url', diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index fbcbe1f40..12e85a716 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -32,7 +32,7 @@ class NPOIE(InfoExtractor): 'http://e.omroep.nl/metadata/aflevering/%s' % video_id, video_id, # We have to remove the javascript callback - transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) + transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//.*$', r'\1', j) ) token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 4295cf93a..d1e12dd8d 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -35,9 +35,7 @@ class RedTubeIE(InfoExtractor): r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') - video_thumbnail = self._html_search_regex( - r'playerInnerHTML.+?<img\s+src="(.+?)"', - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) # No self-labeling, but they describe themselves as # "Home of Videos Porno" diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 205f8a167..dce64e151 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -30,7 +30,7 @@ class RTBFIE(InfoExtractor): page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) data = json.loads(self._html_search_regex( - r'<div class="js-player-embed" data-video="([^"]+)"', page, 'data video'))['data'] + r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] video_url = data.get('downloadUrl') or data.get('url') diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 77fd08dde..c2228b2f0 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ class RTVEALaCartaIE(InfoExtractor): _TEST = { 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '18fcd45965bdd076efdb12cd7f6d7b9e', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', 'info_dict': { 'id': '2491869', 'ext': 'mp4', diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py new file mode 100644 index 000000000..172cc1275 --- /dev/null +++ b/youtube_dl/extractor/sapo.py @@ -0,0 +1,119 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', + 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 198a08c1c..ccd545971 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -20,7 +20,7 @@ class SaveFromIE(InfoExtractor): 'upload_date': '20120816', 'uploader': 'Howcast', 'uploader_id': 'Howcast', - 'description': 'md5:4f0aac94361a12e1ce57d74f85265175', + 'description': 'md5:727900f130df3dc9a25e2721497c7910', }, 'params': { 'skip_download': True diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py new file mode 100644 index 000000000..da3b05a8d --- /dev/null +++ b/youtube_dl/extractor/snotr.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + str_to_int, + parse_duration, +) + + +class SnotrIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' + _TESTS = [{ + 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', + 'info_dict': { + 'id': '13708', + 'ext': 'flv', + 'title': 'Drone flying through fireworks!', + 'duration': 247, + 'filesize_approx': 98566144, + 'description': 'A drone flying through Fourth of July Fireworks', + } + }, { + 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', + 'info_dict': { + 'id': '530', + 'ext': 'flv', + 'title': 'David Letteman - George W. Bush Top 10', + 'duration': 126, + 'filesize_approx': 8912896, + 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + + description = self._og_search_description(webpage) + video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id + + view_count = str_to_int(self._html_search_regex( + r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>', + webpage, 'view count', fatal=False)) + + duration = parse_duration(self._html_search_regex( + r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>', + webpage, 'duration', fatal=False)) + + filesize_approx = float_or_none(self._html_search_regex( + r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>', + webpage, 'filesize', fatal=False), invscale=1024 * 1024) + + return { + 'id': video_id, + 'description': description, + 'title': title, + 'url': video_url, + 'view_count': view_count, + 'duration': duration, + 'filesize_approx': filesize_approx, + } diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py new file mode 100644 index 000000000..75b634bc6 --- /dev/null +++ b/youtube_dl/extractor/sockshare.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, +) +import re + +from .common import InfoExtractor + + +class SockshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)' + _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>' + _TEST = { + 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', + 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', + 'info_dict': { + 'id': '437BE28B89D799D7', + 'title': 'big_buck_bunny_720p_surround.avi', + 'ext': 'avi', + 'thumbnail': 're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://sockshare.com/file/%s' % video_id + webpage = self._download_webpage(url, video_id) + + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, + expected=True) + + confirm_hash = self._html_search_regex(r'''(?x)<input\s+ + type="hidden"\s+ + value="([^"]*)"\s+ + name="hash" + ''', webpage, 'hash') + + fields = { + "hash": confirm_hash, + "confirm": "Continue as Free User" + } + + post = compat_urllib_parse.urlencode(fields) + req = compat_urllib_request.Request(url, post) + # Apparently, this header is required for confirmation to work. + req.add_header('Host', 'www.sockshare.com') + req.add_header('Content-type', 'application/x-www-form-urlencoded') + + webpage = self._download_webpage( + req, video_id, 'Downloading video page') + + video_url = self._html_search_regex( + r'<a href="([^"]*)".+class="download_file_link"', + webpage, 'file url') + video_url = "http://www.sockshare.com" + video_url + title = self._html_search_regex(r'<h1>(.+)<strong>', webpage, 'title') + thumbnail = self._html_search_regex( + r'<img\s+src="([^"]*)".+?name="bg"', + webpage, 'thumbnail') + + formats = [{ + 'format_id': 'sd', + 'url': video_url, + }] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index af689e2c2..183dcb03c 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -53,7 +53,7 @@ class SteamIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20140329', 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': 'md5:6df4fe8dd494ae811869672b0767e025', + 'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 25b9864ad..b87047451 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -19,16 +19,6 @@ class TagesschauIE(InfoExtractor): 'description': 'md5:69da3c61275b426426d711bde96463ab', 'thumbnail': 're:^http:.*\.jpg$', }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', - 'md5': '66652566900963a3f962333579eeffcf', - 'info_dict': { - 'id': '5964', - 'ext': 'mp4', - 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', - 'description': 'md5:07bfc78c48eec3145ed4805299a1900a', - 'thumbnail': 're:http://.*\.jpg', - }, }] _FORMATS = { diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 2c2113b14..46d727d1d 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -62,7 +62,7 @@ class TeacherTubeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('title', webpage, 'title') + title = self._html_search_meta('title', webpage, 'title', fatal=True) TITLE_SUFFIX = ' - TeacherTube' if title.endswith(TITLE_SUFFIX): title = title[:-len(TITLE_SUFFIX)].strip() @@ -101,7 +101,11 @@ class TeacherTubeUserIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' - _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">' + _MEDIA_RE = r'''(?sx) + class="?sidebar_thumb_time"?>[0-9:]+</div> + \s* + <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)" + ''' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -111,14 +115,12 @@ class TeacherTubeUserIE(InfoExtractor): webpage = self._download_webpage(url, user_id) urls.extend(re.findall(self._MEDIA_RE, webpage)) - pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] + pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1] for p in pages: more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) - webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) - urls.extend(re.findall(self._MEDIA_RE, webpage)) - - entries = [] - for url in urls: - entries.append(self.url_result(url, 'TeacherTube')) + webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages))) + video_urls = re.findall(self._MEDIA_RE, webpage) + urls.extend(video_urls) + entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls] return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index 8477840fc..81ba169fb 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 255855558..a3c6e83b0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -98,7 +98,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'info_dict': { 'id': '54469442', 'ext': 'mp4', - 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', + 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 68c59364b..6d3b78749 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -10,7 +10,7 @@ from ..utils import ( class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f741ba540..ab28ef6fe 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -55,14 +55,14 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', - 'md5': '24e83813e832badb0a8d7d1ef9ef0691', + 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', + 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', 'info_dict': { - 'id': 'mdb-463528', + 'id': 'mdb-478135', 'ext': 'mp3', - 'title': 'Süpersong: Soul Bossa Nova', + 'title': 'Flavia Coelho: Amar é Amar', 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140630', + 'upload_date': '20140717', }, }, ] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6123e1256..072e711c2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,19 +1,17 @@ # coding: utf-8 -import collections import errno import io import itertools import json import os.path import re -import struct import traceback -import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter +from ..swfinterp import SWFInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -347,8 +345,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'RTMP download detected') def _extract_signature_function(self, video_id, player_url, slen): - id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', - player_url) + id_m = re.match( + r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$', + player_url) player_type = id_m.group('ext') player_id = id_m.group('id') @@ -449,417 +448,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - u'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError(u'Unsupported compression format %r' % - file_contents[:1]) - - def extract_tags(content): - pos = 0 - while pos < len(content): - header16 = struct.unpack('<H', content[pos:pos+2])[0] - pos += 2 - tag_code = header16 >> 6 - tag_len = header16 & 0x3f - if tag_len == 0x3f: - tag_len = struct.unpack('<I', content[pos:pos+4])[0] - pos += 4 - assert pos+tag_len <= len(content) - yield (tag_code, content[pos:pos+tag_len]) - pos += tag_len - - code_tag = next(tag - for tag_code, tag in extract_tags(content) - if tag_code == 82) - p = code_tag.index(b'\0', 4) + 1 - code_reader = io.BytesIO(code_tag[p:]) - - # Parse ABC (AVM2 ByteCode) - def read_int(reader=None): - if reader is None: - reader = code_reader - res = 0 - shift = 0 - for _ in range(5): - buf = reader.read(1) - assert len(buf) == 1 - b = struct.unpack('<B', buf)[0] - res = res | ((b & 0x7f) << shift) - if b & 0x80 == 0: - break - shift += 7 - return res - - def u30(reader=None): - res = read_int(reader) - assert res & 0xf0000000 == 0 - return res - u32 = read_int - - def s32(reader=None): - v = read_int(reader) - if v & 0x80000000 != 0: - v = - ((v ^ 0xffffffff) + 1) - return v - - def read_string(reader=None): - if reader is None: - reader = code_reader - slen = u30(reader) - resb = reader.read(slen) - assert len(resb) == slen - return resb.decode('utf-8') - - def read_bytes(count, reader=None): - if reader is None: - reader = code_reader - resb = reader.read(count) - assert len(resb) == count - return resb - - def read_byte(reader=None): - resb = read_bytes(1, reader=reader) - res = struct.unpack('<B', resb)[0] - return res - - # minor_version + major_version - read_bytes(2 + 2) - - # Constant pool - int_count = u30() - for _c in range(1, int_count): - s32() - uint_count = u30() - for _c in range(1, uint_count): - u32() - double_count = u30() - read_bytes((double_count-1) * 8) - string_count = u30() - constant_strings = [u''] - for _c in range(1, string_count): - s = read_string() - constant_strings.append(s) - namespace_count = u30() - for _c in range(1, namespace_count): - read_bytes(1) # kind - u30() # name - ns_set_count = u30() - for _c in range(1, ns_set_count): - count = u30() - for _c2 in range(count): - u30() - multiname_count = u30() - MULTINAME_SIZES = { - 0x07: 2, # QName - 0x0d: 2, # QNameA - 0x0f: 1, # RTQName - 0x10: 1, # RTQNameA - 0x11: 0, # RTQNameL - 0x12: 0, # RTQNameLA - 0x09: 2, # Multiname - 0x0e: 2, # MultinameA - 0x1b: 1, # MultinameL - 0x1c: 1, # MultinameLA - } - multinames = [u''] - for _c in range(1, multiname_count): - kind = u30() - assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind - if kind == 0x07: - u30() # namespace_idx - name_idx = u30() - multinames.append(constant_strings[name_idx]) - else: - multinames.append('[MULTINAME kind: %d]' % kind) - for _c2 in range(MULTINAME_SIZES[kind]): - u30() - - # Methods - method_count = u30() - MethodInfo = collections.namedtuple( - 'MethodInfo', - ['NEED_ARGUMENTS', 'NEED_REST']) - method_infos = [] - for method_id in range(method_count): - param_count = u30() - u30() # return type - for _ in range(param_count): - u30() # param type - u30() # name index (always 0 for youtube) - flags = read_byte() - if flags & 0x08 != 0: - # Options present - option_count = u30() - for c in range(option_count): - u30() # val - read_bytes(1) # kind - if flags & 0x80 != 0: - # Param names present - for _ in range(param_count): - u30() # param name - mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) - method_infos.append(mi) - - # Metadata - metadata_count = u30() - for _c in range(metadata_count): - u30() # name - item_count = u30() - for _c2 in range(item_count): - u30() # key - u30() # value - - def parse_traits_info(): - trait_name_idx = u30() - kind_full = read_byte() - kind = kind_full & 0x0f - attrs = kind_full >> 4 - methods = {} - if kind in [0x00, 0x06]: # Slot or Const - u30() # Slot id - u30() # type_name_idx - vindex = u30() - if vindex != 0: - read_byte() # vkind - elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - u30() # disp_id - method_idx = u30() - methods[multinames[trait_name_idx]] = method_idx - elif kind == 0x04: # Class - u30() # slot_id - u30() # classi - elif kind == 0x05: # Function - u30() # slot_id - function_idx = u30() - methods[function_idx] = multinames[trait_name_idx] - else: - raise ExtractorError(u'Unsupported trait kind %d' % kind) - - if attrs & 0x4 != 0: # Metadata present - metadata_count = u30() - for _c3 in range(metadata_count): - u30() # metadata index - - return methods - - # Classes + swfi = SWFInterpreter(file_contents) TARGET_CLASSNAME = u'SignatureDecipher' - searched_idx = multinames.index(TARGET_CLASSNAME) - searched_class_id = None - class_count = u30() - for class_id in range(class_count): - name_idx = u30() - if name_idx == searched_idx: - # We found the class we're looking for! - searched_class_id = class_id - u30() # super_name idx - flags = read_byte() - if flags & 0x08 != 0: # Protected namespace is present - u30() # protected_ns_idx - intrf_count = u30() - for _c2 in range(intrf_count): - u30() - u30() # iinit - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - if searched_class_id is None: - raise ExtractorError(u'Target class %r not found' % - TARGET_CLASSNAME) - - method_names = {} - method_idxs = {} - for class_id in range(class_count): - u30() # cinit - trait_count = u30() - for _c2 in range(trait_count): - trait_methods = parse_traits_info() - if class_id == searched_class_id: - method_names.update(trait_methods.items()) - method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) - - # Scripts - script_count = u30() - for _c in range(script_count): - u30() # init - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - # Method bodies - method_body_count = u30() - Method = collections.namedtuple('Method', ['code', 'local_count']) - methods = {} - for _c in range(method_body_count): - method_idx = u30() - u30() # max_stack - local_count = u30() - u30() # init_scope_depth - u30() # max_scope_depth - code_length = u30() - code = read_bytes(code_length) - if method_idx in method_idxs: - m = Method(code, local_count) - methods[method_idxs[method_idx]] = m - exception_count = u30() - for _c2 in range(exception_count): - u30() # from - u30() # to - u30() # target - u30() # exc_type - u30() # var_name - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - assert p + code_reader.tell() == len(code_tag) - assert len(methods) == len(method_idxs) - - method_pyfunctions = {} - - def extract_function(func_name): - if func_name in method_pyfunctions: - return method_pyfunctions[func_name] - if func_name not in methods: - raise ExtractorError(u'Cannot find function %r' % func_name) - m = methods[func_name] - - def resfunc(args): - registers = ['(this)'] + list(args) + [None] * m.local_count - stack = [] - coder = io.BytesIO(m.code) - while True: - opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 36: # pushbyte - v = struct.unpack('!B', coder.read(1))[0] - stack.append(v) - elif opcode == 44: # pushstring - idx = u30(coder) - stack.append(constant_strings[idx]) - elif opcode == 48: # pushscope - # We don't implement the scope register, so we'll just - # ignore the popped value - stack.pop() - elif opcode == 70: # callproperty - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, compat_str) - if args[0] == u'': - res = list(obj) - else: - res = obj.split(args[0]) - stack.append(res) - elif mname == u'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - assert isinstance(obj, list) - res = obj[args[0]:] - stack.append(res) - elif mname == u'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, list) - res = args[0].join(obj) - stack.append(res) - elif mname in method_pyfunctions: - stack.append(method_pyfunctions[mname](args)) - else: - raise NotImplementedError( - u'Unsupported property %r on %r' - % (mname, obj)) - elif opcode == 72: # returnvalue - res = stack.pop() - return res - elif opcode == 79: # callpropvoid - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'reverse': - assert isinstance(obj, list) - obj.reverse() - else: - raise NotImplementedError( - u'Unsupported (void) property %r on %r' - % (mname, obj)) - elif opcode == 93: # findpropstrict - index = u30(coder) - mname = multinames[index] - res = extract_function(mname) - stack.append(res) - elif opcode == 97: # setproperty - index = u30(coder) - value = stack.pop() - idx = stack.pop() - obj = stack.pop() - assert isinstance(obj, list) - assert isinstance(idx, int) - obj[idx] = value - elif opcode == 98: # getlocal - index = u30(coder) - stack.append(registers[index]) - elif opcode == 99: # setlocal - index = u30(coder) - value = stack.pop() - registers[index] = value - elif opcode == 102: # getproperty - index = u30(coder) - pname = multinames[index] - if pname == u'length': - obj = stack.pop() - assert isinstance(obj, list) - stack.append(len(obj)) - else: # Assume attribute access - idx = stack.pop() - assert isinstance(idx, int) - obj = stack.pop() - assert isinstance(obj, list) - stack.append(obj[idx]) - elif opcode == 128: # coerce - u30(coder) - elif opcode == 133: # coerce_s - assert isinstance(stack[-1], (type(None), compat_str)) - elif opcode == 164: # modulo - value2 = stack.pop() - value1 = stack.pop() - res = value1 % value2 - stack.append(res) - elif opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 211: # getlocal_3 - stack.append(registers[3]) - elif opcode == 214: # setlocal_2 - registers[2] = stack.pop() - elif opcode == 215: # setlocal_3 - registers[3] = stack.pop() - else: - raise NotImplementedError( - u'Unsupported opcode %d' % opcode) - - method_pyfunctions[func_name] = resfunc - return resfunc - - initial_function = extract_function(u'decipher') + searched_class = swfi.extract_class(TARGET_CLASSNAME) + initial_function = swfi.extract_function(searched_class, u'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): @@ -1014,14 +606,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube - data = compat_urllib_parse.urlencode({'video_id': video_id, - 'el': 'player_embedded', - 'gl': 'US', - 'hl': 'en', - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'asv': 3, - 'sts':'1588', - }) + data = compat_urllib_parse.urlencode({ + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'sts':'16268', + }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, @@ -1220,31 +809,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&signature=' + url_data['sig'][0] elif 's' in url_data: encrypted_sig = url_data['s'][0] + + if not age_gate: + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + player_url = json.loads(jsplayer_url_json) + if player_url is None: + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, u'age gate player URL') + player_url = json.loads(player_url_json) + if self._downloader.params.get('verbose'): - if age_gate: - if player_url is None: - player_version = 'unknown' - else: + if player_url is None: + player_version = 'unknown' + player_desc = 'unknown' + else: + if player_url.endswith('swf'): player_version = self._search_regex( - r'-(.+)\.swf$', player_url, + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, u'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - r'html5player-(.+?)\.js', video_webpage, - 'html5 player', fatal=False) - player_desc = u'html5 player %s' % player_version + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, + 'html5 player', fatal=False) + player_desc = u'html5 player %s' % player_version parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) - if not age_gate: - jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, u'JS player URL') - player_url = json.loads(jsplayer_url_json) - signature = self._decrypt_signature( encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py new file mode 100644 index 000000000..b63c65b20 --- /dev/null +++ b/youtube_dl/swfinterp.py @@ -0,0 +1,609 @@ +from __future__ import unicode_literals + +import collections +import io +import zlib + +from .utils import ( + compat_str, + ExtractorError, + struct_unpack, +) + + +def _extract_tags(file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + 'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError( + 'Unsupported compression format %r' % + file_contents[:1]) + + # Determine number of bits in framesize rectangle + framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3 + framesize_len = (5 + 4 * framesize_nbits + 7) // 8 + + pos = framesize_len + 2 + 2 + while pos < len(content): + header16 = struct_unpack('<H', content[pos:pos + 2])[0] + pos += 2 + tag_code = header16 >> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct_unpack('<I', content[pos:pos + 4])[0] + pos += 4 + assert pos + tag_len <= len(content), \ + ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' + % (tag_code, pos, tag_len, len(content))) + yield (tag_code, content[pos:pos + tag_len]) + pos += tag_len + + +class _AVMClass_Object(object): + def __init__(self, avm_class): + self.avm_class = avm_class + + def __repr__(self): + return '%s#%x' % (self.avm_class.name, id(self)) + + +class _ScopeDict(dict): + def __init__(self, avm_class): + super(_ScopeDict, self).__init__() + self.avm_class = avm_class + + def __repr__(self): + return '%s__Scope(%s)' % ( + self.avm_class.name, + super(_ScopeDict, self).__repr__()) + + +class _AVMClass(object): + def __init__(self, name_idx, name): + self.name_idx = name_idx + self.name = name + self.method_names = {} + self.method_idxs = {} + self.methods = {} + self.method_pyfunctions = {} + + self.variables = _ScopeDict(self) + + def make_object(self): + return _AVMClass_Object(self) + + def __repr__(self): + return '_AVMClass(%s)' % (self.name) + + def register_methods(self, methods): + self.method_names.update(methods.items()) + self.method_idxs.update(dict( + (idx, name) + for name, idx in methods.items())) + + +class _Multiname(object): + def __init__(self, kind): + self.kind = kind + + def __repr__(self): + return '[MULTINAME kind: 0x%x]' % self.kind + + +def _read_int(reader): + res = 0 + shift = 0 + for _ in range(5): + buf = reader.read(1) + assert len(buf) == 1 + b = struct_unpack('<B', buf)[0] + res = res | ((b & 0x7f) << shift) + if b & 0x80 == 0: + break + shift += 7 + return res + + +def _u30(reader): + res = _read_int(reader) + assert res & 0xf0000000 == 0 + return res +_u32 = _read_int + + +def _s32(reader): + v = _read_int(reader) + if v & 0x80000000 != 0: + v = - ((v ^ 0xffffffff) + 1) + return v + + +def _s24(reader): + bs = reader.read(3) + assert len(bs) == 3 + last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' + return struct_unpack('<i', bs + last_byte)[0] + + +def _read_string(reader): + slen = _u30(reader) + resb = reader.read(slen) + assert len(resb) == slen + return resb.decode('utf-8') + + +def _read_bytes(count, reader): + assert count >= 0 + resb = reader.read(count) + assert len(resb) == count + return resb + + +def _read_byte(reader): + resb = _read_bytes(1, reader=reader) + res = struct_unpack('<B', resb)[0] + return res + + +class SWFInterpreter(object): + def __init__(self, file_contents): + code_tag = next(tag + for tag_code, tag in _extract_tags(file_contents) + if tag_code == 82) + p = code_tag.index(b'\0', 4) + 1 + code_reader = io.BytesIO(code_tag[p:]) + + # Parse ABC (AVM2 ByteCode) + + # Define a couple convenience methods + u30 = lambda *args: _u30(*args, reader=code_reader) + s32 = lambda *args: _s32(*args, reader=code_reader) + u32 = lambda *args: _u32(*args, reader=code_reader) + read_bytes = lambda *args: _read_bytes(*args, reader=code_reader) + read_byte = lambda *args: _read_byte(*args, reader=code_reader) + + # minor_version + major_version + read_bytes(2 + 2) + + # Constant pool + int_count = u30() + for _c in range(1, int_count): + s32() + uint_count = u30() + for _c in range(1, uint_count): + u32() + double_count = u30() + read_bytes(max(0, (double_count - 1)) * 8) + string_count = u30() + self.constant_strings = [''] + for _c in range(1, string_count): + s = _read_string(code_reader) + self.constant_strings.append(s) + namespace_count = u30() + for _c in range(1, namespace_count): + read_bytes(1) # kind + u30() # name + ns_set_count = u30() + for _c in range(1, ns_set_count): + count = u30() + for _c2 in range(count): + u30() + multiname_count = u30() + MULTINAME_SIZES = { + 0x07: 2, # QName + 0x0d: 2, # QNameA + 0x0f: 1, # RTQName + 0x10: 1, # RTQNameA + 0x11: 0, # RTQNameL + 0x12: 0, # RTQNameLA + 0x09: 2, # Multiname + 0x0e: 2, # MultinameA + 0x1b: 1, # MultinameL + 0x1c: 1, # MultinameLA + } + self.multinames = [''] + for _c in range(1, multiname_count): + kind = u30() + assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind + if kind == 0x07: + u30() # namespace_idx + name_idx = u30() + self.multinames.append(self.constant_strings[name_idx]) + else: + self.multinames.append(_Multiname(kind)) + for _c2 in range(MULTINAME_SIZES[kind]): + u30() + + # Methods + method_count = u30() + MethodInfo = collections.namedtuple( + 'MethodInfo', + ['NEED_ARGUMENTS', 'NEED_REST']) + method_infos = [] + for method_id in range(method_count): + param_count = u30() + u30() # return type + for _ in range(param_count): + u30() # param type + u30() # name index (always 0 for youtube) + flags = read_byte() + if flags & 0x08 != 0: + # Options present + option_count = u30() + for c in range(option_count): + u30() # val + read_bytes(1) # kind + if flags & 0x80 != 0: + # Param names present + for _ in range(param_count): + u30() # param name + mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) + method_infos.append(mi) + + # Metadata + metadata_count = u30() + for _c in range(metadata_count): + u30() # name + item_count = u30() + for _c2 in range(item_count): + u30() # key + u30() # value + + def parse_traits_info(): + trait_name_idx = u30() + kind_full = read_byte() + kind = kind_full & 0x0f + attrs = kind_full >> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[self.multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = self.multinames[trait_name_idx] + else: + raise ExtractorError('Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + class_count = u30() + classes = [] + for class_id in range(class_count): + name_idx = u30() + + cname = self.multinames[name_idx] + avm_class = _AVMClass(name_idx, cname) + classes.append(avm_class) + + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + avm_class.register_methods(trait_methods) + + assert len(classes) == class_count + self._classes_by_name = dict((c.name, c) for c in classes) + + for avm_class in classes: + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + avm_class.register_methods(trait_methods) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + for avm_class in classes: + if method_idx in avm_class.method_idxs: + m = Method(code, local_count) + avm_class.methods[avm_class.method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + + def extract_class(self, class_name): + try: + return self._classes_by_name[class_name] + except KeyError: + raise ExtractorError('Class %r not found' % class_name) + + def extract_function(self, avm_class, func_name): + if func_name in avm_class.method_pyfunctions: + return avm_class.method_pyfunctions[func_name] + if func_name in self._classes_by_name: + return self._classes_by_name[func_name].make_object() + if func_name not in avm_class.methods: + raise ExtractorError('Cannot find function %s.%s' % ( + avm_class.name, func_name)) + m = avm_class.methods[func_name] + + def resfunc(args): + # Helper functions + coder = io.BytesIO(m.code) + s24 = lambda: _s24(coder) + u30 = lambda: _u30(coder) + + registers = [avm_class.variables] + list(args) + [None] * m.local_count + stack = [] + scopes = collections.deque([ + self._classes_by_name, avm_class.variables]) + while True: + opcode = _read_byte(coder) + if opcode == 17: # iftrue + offset = s24() + value = stack.pop() + if value: + coder.seek(coder.tell() + offset) + elif opcode == 18: # iffalse + offset = s24() + value = stack.pop() + if not value: + coder.seek(coder.tell() + offset) + elif opcode == 36: # pushbyte + v = _read_byte(coder) + stack.append(v) + elif opcode == 42: # dup + value = stack[-1] + stack.append(value) + elif opcode == 44: # pushstring + idx = u30() + stack.append(self.constant_strings[idx]) + elif opcode == 48: # pushscope + new_scope = stack.pop() + scopes.append(new_scope) + elif opcode == 66: # construct + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + res = obj.avm_class.make_object() + stack.append(res) + elif opcode == 70: # callproperty + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + + if isinstance(obj, _AVMClass_Object): + func = self.extract_function(obj.avm_class, mname) + res = func(args) + stack.append(res) + continue + elif isinstance(obj, _ScopeDict): + if mname in obj.avm_class.method_names: + func = self.extract_function(obj.avm_class, mname) + res = func(args) + else: + res = obj[mname] + stack.append(res) + continue + elif isinstance(obj, compat_str): + if mname == 'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + if args[0] == '': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + continue + elif isinstance(obj, list): + if mname == 'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + res = obj[args[0]:] + stack.append(res) + continue + elif mname == 'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + res = args[0].join(obj) + stack.append(res) + continue + raise NotImplementedError( + 'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 74: # constructproperty + index = u30() + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + + mname = self.multinames[index] + assert isinstance(obj, _AVMClass) + + # We do not actually call the constructor for now; + # we just pretend it does nothing + stack.append(obj.make_object()) + elif opcode == 79: # callpropvoid + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + 'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 86: # newarray + arg_count = u30() + arr = [] + for i in range(arg_count): + arr.append(stack.pop()) + arr = arr[::-1] + stack.append(arr) + elif opcode == 93: # findpropstrict + index = u30() + mname = self.multinames[index] + for s in reversed(scopes): + if mname in s: + res = s + break + else: + res = scopes[0] + stack.append(res[mname]) + elif opcode == 94: # findproperty + index = u30() + mname = self.multinames[index] + for s in reversed(scopes): + if mname in s: + res = s + break + else: + res = avm_class.variables + stack.append(res) + elif opcode == 96: # getlex + index = u30() + mname = self.multinames[index] + for s in reversed(scopes): + if mname in s: + scope = s + break + else: + scope = avm_class.variables + # I cannot find where static variables are initialized + # so let's just return None + res = scope.get(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30() + value = stack.pop() + idx = self.multinames[index] + if isinstance(idx, _Multiname): + idx = stack.pop() + obj = stack.pop() + obj[idx] = value + elif opcode == 98: # getlocal + index = u30() + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30() + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30() + pname = self.multinames[index] + if pname == 'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 115: # convert_ + value = stack.pop() + intvalue = int(value) + stack.append(intvalue) + elif opcode == 128: # coerce + u30() + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 160: # add + value2 = stack.pop() + value1 = stack.pop() + res = value1 + value2 + stack.append(res) + elif opcode == 161: # subtract + value2 = stack.pop() + value1 = stack.pop() + res = value1 - value2 + stack.append(res) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 175: # greaterequals + value2 = stack.pop() + value1 = stack.pop() + result = value1 >= value2 + stack.append(result) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 212: # setlocal_0 + registers[0] = stack.pop() + elif opcode == 213: # setlocal_1 + registers[1] = stack.pop() + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + 'Unsupported opcode %d' % opcode) + + avm_class.method_pyfunctions[func_name] = resfunc + return resfunc + diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64a9618ca..3ecd798d7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -91,11 +91,9 @@ except ImportError: compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - def _unquote(string, encoding='utf-8', errors='replace'): + from urllib.parse import unquote as compat_urllib_parse_unquote +except ImportError: + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): if string == '': return string res = string.split('%') @@ -130,6 +128,13 @@ except ImportError: # Python 2 string += pct_sequence.decode(encoding, errors) return string + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'): qs, _coerce_result = qs, unicode @@ -149,10 +154,12 @@ except ImportError: # Python 2 continue if len(nv[1]) or keep_blank_values: name = nv[0].replace('+', ' ') - name = _unquote(name, encoding=encoding, errors=errors) + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) name = _coerce_result(name) value = nv[1].replace('+', ' ') - value = _unquote(value, encoding=encoding, errors=errors) + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) value = _coerce_result(value) r.append((name, value)) return r @@ -1193,11 +1200,6 @@ def format_bytes(bytes): return u'%.2f%s' % (converted, suffix) -def str_to_int(int_str): - int_str = re.sub(r'[,\.]', u'', int_str) - return int(int_str) - - def get_term_width(): columns = os.environ.get('COLUMNS', None) if columns: @@ -1265,15 +1267,22 @@ class HEADRequest(compat_urllib_request.Request): return "HEAD" -def int_or_none(v, scale=1, default=None, get_attr=None): +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) - return default if v is None else (int(v) // scale) + return default if v is None else (int(v) * invscale // scale) + + +def str_to_int(int_str): + if int_str is None: + return None + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) -def float_or_none(v, scale=1, default=None): - return default if v is None else (float(v) / scale) +def float_or_none(v, scale=1, invscale=1, default=None): + return default if v is None else (float(v) * invscale / scale) def parse_duration(s): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4d606c3d2..e5fcec839 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.15' +__version__ = '2014.07.22' |