41 files changed, 1287 insertions, 413 deletions
diff --git a/AUTHORS b/AUTHORS
@@ -127,3 +127,4 @@ Julian Richen
 Ping O.
 Mister Hat
 Peter Ding
+jackyzy823
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ which means you can modify it, redistribute it or use it however you like.
     -i, --ignore-errors              Continue on download errors, for example to skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the playlist or the command line) if an error occurs
     --dump-user-agent                Display the current browser identification
-    --list-extractors                List all supported extractors and the URLs they would handle
+    --list-extractors                List all supported extractors
     --extractor-descriptions         Output descriptions of all supported extractors
     --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The
@@ -223,7 +223,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
-    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
+    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default;
                                      fix file if we can, warn otherwise)
     --prefer-avconv                  Prefer avconv over ffmpeg for running the postprocessors (default)
     --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the postprocessors
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index d147b53fe..220e52b98 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -120,6 +120,8 @@
  - **divxstage**: DivxStage
  - **Dotsub**
  - **DouyuTV**
+ - **dramafever**
+ - **dramafever:series**
  - **DRBonanza**
  - **Dropbox**
  - **DrTuber**
@@ -153,6 +155,7 @@
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
  - **Firstpost**
+ - **FiveTV**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
@@ -217,6 +220,7 @@
  - **instagram:user**: Instagram user profile
  - **InternetVideoArchive**
  - **IPrima**
+ - **iqiyi**
  - **ivi**: ivi.ru
  - **ivi:compilation**: ivi.ru compilations
  - **Izlesene**
@@ -407,6 +411,7 @@
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **Ruutu**
  - **safari**: safaribooksonline.com online video
  - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
@@ -519,6 +524,8 @@
  - **TV2**
  - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
+ - **TVC**
+ - **TVCArticle**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
  - **tvp.pl:Series**
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index a7d3a1c01..ef0f71bad 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -119,7 +119,7 @@ class YoutubeDL(object):
     username:          Username for authentication purposes.
     password:          Password for authentication purposes.
-    videopassword:     Password for acces a video.
+    videopassword:     Password for accessing a video.
     usenetrc:          Use netrc for authentication instead.
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
@@ -1037,12 +1037,6 @@ class YoutubeDL(object):
             info_dict['id'], info_dict.get('subtitles'),
             info_dict.get('automatic_captions'))

-        # This extractors handle format selection themselves
-        if info_dict['extractor'] in ['Youku']:
-            if download:
-                self.process_info(info_dict)
-            return info_dict
-
         # We now pick which formats have to be downloaded
         if info_dict.get('formats') is None:
             # There's only one format available
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 3bc62e9d5..dc1a302e6 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,7 +4,10 @@ from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+    AdobeTVIE,
+    AdobeTVVideoIE,
+)
 from .adultswim import AdultSwimIE
 from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
@@ -103,6 +106,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
     DailymotionUserIE,
+    DailymotionCloudIE,
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
@@ -112,6 +116,10 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
+from .dramafever import (
+    DramaFeverIE,
+    DramaFeverSeriesIE,
+)
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
@@ -397,6 +405,7 @@ from .pbs import PBSIE
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
@@ -692,7 +701,10 @@ from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 97d128560..5e43adc51 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -5,6 +5,8 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     str_to_int,
+    float_or_none,
+    ISO639Utils,
 )


@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class AdobeTVVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+            video_id)
+
+        formats = [{
+            'url': source['src'],
+            'width': source.get('width'),
+            'height': source.get('height'),
+            'tbr': source.get('bitrate'),
+        } for source in player_params['sources']]
+
+        # For both metadata and downloaded files the duration varies among
+        # formats. I just pick the max one
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in player_params['sources']]))
+
+        subtitles = {}
+        for translation in player_params.get('translations', []):
+            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            if lang_id not in subtitles:
+                subtitles[lang_id] = []
+            subtitles[lang_id].append({
+                'url': translation['vttPath'],
+                'ext': 'vtt',
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': player_params['title'],
+            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
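Note: AdobeTVVideoIE takes everything from the `var bridge = {...};` assignment embedded in the player page. A minimal standalone sketch of that parse-and-pick-max-duration logic (the `page` string is an invented stand-in, not actual Adobe TV markup):

import json
import re

page = 'var bridge = {"sources": [{"src": "http://example.com/a.mp4", "duration": 248667}, {"src": "http://example.com/b.mp4", "duration": 248500}]};'

bridge = json.loads(re.search(r'var\s+bridge\s*=\s*([^;]+);', page).group(1))
# durations are reported in milliseconds and differ per source, hence the max()
duration = max(
    source['duration'] / 1000.0
    for source in bridge['sources'] if source.get('duration'))
print(duration)  # 248.667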
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
index 249bc6bbd..5825d2867 100644
--- a/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbccouk.py
@@ -130,6 +130,20 @@ class BBCCoUkIE(InfoExtractor):
         },
         'skip': 'geolocation',
     }, {
+        'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+        'info_dict': {
+            'id': 'b05zmgw1',
+            'ext': 'flv',
+            'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+            'title': 'Royal Academy Summer Exhibition',
+            'duration': 3540,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': 'geolocation',
+    }, {
         'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
         'only_matching': True,
     }, {
@@ -237,26 +251,11 @@
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
             subtitles[lang] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
             ]
         return subtitles
@@ -267,7 +266,7 @@
                 programme_id, 'Downloading media selection XML')
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
+                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
             else:
                 raise
@@ -362,7 +361,7 @@
             formats, subtitles = self._download_media_selector(programme_id)
             title = self._og_search_title(webpage)
             description = self._search_regex(
-                r'<p class="medium-description">([^<]+)</p>',
+                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                 webpage, 'description', fatal=False)
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index c1d4320e1..4721c2293 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -13,6 +13,7 @@ from ..compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..utils import (
     determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
         try:
             object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
-        except xml.etree.ElementTree.ParseError:
+        except compat_xml_parse_error:
             return

         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
@@ -156,6 +157,28 @@ class BrightcoveIE(InfoExtractor):
         linkBase = find_param('linkBaseURL')
         if linkBase is not None:
             params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brighcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #     // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC.\createVideo\(
+                .*?                                   # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*    # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                      # in length, however it's appended to itself
+                                                      # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']            # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
         data = compat_urllib_parse.urlencode(params)
         return cls._FEDERATED_URL_TEMPLATE % data
@@ -188,7 +211,12 @@
             [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
             ).+?>\s*</object>''',
             webpage)
-        return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+        if matches:
+            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+        return list(filter(None, [
+            cls._build_brighcove_url_from_js(custom_bc)
+            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))

     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
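Note: `_build_brighcove_url_from_js` (the spelling matches the pre-existing `_build_brighcove_url` helper) only needs the three quoted arguments out of `customBC.createVideo(...)`. A rough sketch with a cleaned-up version of the regex against a fabricated JS snippet:

import re

js = 'customBC.createVideo(640, 360, "3550052898001", "AQ%s", "4080870393001", "rnd");' % ('x' * 48)

m = re.search(
    r'''(?x)customBC\.createVideo\(
        .*?                                 # skip width and height
        ["\'](?P<playerID>\d+)["\']\s*,\s*
        ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*
        ["\'](?P<videoID>\d+)["\']
    ''', js)
print(m.groupdict() if m else None)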
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 70aa4333c..96f0ed9ad 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': full_user,
             'entries': self._extract_entries(user),
         }
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+    _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
+
+    _TEST = {
+        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+        # Tested at FranceTvInfo_2
+        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def _extract_dmcloud_url(self, webpage):
+        mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+        mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        request = self._build_request(url)
+        webpage = self._download_webpage(request, video_id)
+
+        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+        video_info = self._parse_json(self._search_regex(
+            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+        # TODO: parse ios_url, which is in fact a manifest
+        video_url = video_info['mp4_url']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': video_info.get('thumbnail_url'),
+        }
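Note: `_extract_dmcloud_url` is the hook FranceTVInfo and the generic extractor (further down) use to hand dmcloud embeds over to this class. Reduced to a self-contained sketch (the iframe markup is invented):

import re

page = '<iframe src="http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=x"></iframe>'

m = re.search(
    r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]',
    page)
print(m.group(1) if m else None)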
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644
index 000000000..ca41a3abf
--- /dev/null
+++ b/youtube_dl/extractor/dramafever.py
@@ -0,0 +1,197 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+    _NETRC_MACHINE = 'dramafever'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'username': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if all(logout_pattern not in response
+               for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+            error = self._html_search_regex(
+                r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+    _TEST = {
+        'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin 4512.1',
+            'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1404336058,
+            'upload_date': '20140702',
+            'duration': 343,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url).replace('/', '.')
+
+        try:
+            feed = self._download_json(
+                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+                video_id, 'Downloading episode JSON')['channel']['item']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                raise ExtractorError(
+                    'Currently unavailable in your country.', expected=True)
+            raise
+
+        media_group = feed.get('media-group', {})
+
+        formats = []
+        for media_content in media_group['media-content']:
+            src = media_content.get('@attributes', {}).get('url')
+            if not src:
+                continue
+            ext = determine_ext(src)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    src, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = media_group.get('media-title')
+        description = media_group.get('media-description')
+        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+        thumbnail = self._proto_relative_url(
+            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+        subtitles = {}
+        for media_subtitle in media_group.get('media-subTitle', []):
+            lang = media_subtitle.get('@attributes', {}).get('lang')
+            href = media_subtitle.get('@attributes', {}).get('href')
+            if not lang or not href:
+                continue
+            subtitles[lang] = [{
+                'ext': 'ttml',
+                'url': href,
+            }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever:series'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+    _TESTS = [{
+        'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512',
+            'title': 'Cooking with Shin',
+            'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://www.dramafever.com/drama/124/IRIS/',
+        'info_dict': {
+            'id': '124',
+            'title': 'IRIS',
+            'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+        },
+        'playlist_count': 20,
+    }]
+
+    _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+    _PAGE_SIZE = 60  # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+    def _get_consumer_secret(self, video_id):
+        mainjs = self._download_webpage(
+            'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+            video_id, 'Downloading main.js', fatal=False)
+        if not mainjs:
+            return self._CONSUMER_SECRET
+        return self._search_regex(
+            r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+            'consumer secret', default=self._CONSUMER_SECRET)
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        consumer_secret = self._get_consumer_secret(series_id)
+
+        series = self._download_json(
+            'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+            % (consumer_secret, series_id),
+            series_id, 'Downloading series JSON')['series'][series_id]
+
+        title = clean_html(series['name'])
+        description = clean_html(series.get('description') or series.get('description_short'))
+
+        entries = []
+        for page_num in itertools.count(1):
+            episodes = self._download_json(
+                'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+                % (consumer_secret, series_id, self._PAGE_SIZE, page_num),
+                series_id, 'Downloading episodes JSON page #%d' % page_num)
+            for episode in episodes.get('value', []):
+                episode_url = episode.get('episode_url')
+                if not episode_url:
+                    continue
+                entries.append(self.url_result(
+                    compat_urlparse.urljoin(url, episode_url),
+                    'DramaFever', episode.get('guid')))
+            if page_num == episodes['num_pages']:
+                break
+
+        return self.playlist_result(entries, series_id, title, description)
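Note: the series extractor pages through the episode API with itertools.count and stops once the reported num_pages is reached. The control flow in isolation, with the JSON download stubbed out by invented page payloads:

import itertools

def download_episodes_page(page_number):  # stand-in for the real _download_json call
    pages = {
        1: {'value': ['ep-4512-1', 'ep-4512-2'], 'num_pages': 2},
        2: {'value': ['ep-4512-3'], 'num_pages': 2},
    }
    return pages[page_number]

entries = []
for page_num in itertools.count(1):
    episodes = download_episodes_page(page_num)
    entries.extend(episodes.get('value', []))
    if page_num == episodes['num_pages']:
        break
print(entries)  # ['ep-4512-1', 'ep-4512-2', 'ep-4512-3']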
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 7626219ba..8b98b013a 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
     _TESTS = [{
         'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
-        'md5': 'fe330252ddea607635cf2eb2c99a0af3',
         'info_dict': {
             'id': '65517',
             'ext': 'mp4',
@@ -26,6 +25,9 @@
             'upload_date': '20110120',
             'duration': 3664,
         },
+        'params': {
+            'skip_download': True,  # requires rtmp
+        },
     }, {
         'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
         'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@
                     'format_id': file['Type'].replace('Video', ''),
                     'preference': preferencemap.get(file['Type'], -10),
                 })
+                if format['url'].startswith('rtmp'):
+                    rtmp_url = format['url']
+                    format['rtmp_live'] = True  # --resume does not work
+                    if '/bonanza/' in rtmp_url:
+                        format['play_path'] = rtmp_url.split('/bonanza/')[1]
                 formats.append(format)
             elif file['Type'] == "Thumb":
                 thumbnail = file['Location']
@@ -111,9 +118,6 @@
         description = '%s\n%s\n%s\n' % (
             info['Description'], info['Actors'], info['Colophon'])

-        for f in formats:
-            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
-            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
         self._sort_formats(formats)

         display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
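Note: rather than rewriting rtmp:// URLs to plain HTTP (the deleted replace() calls), the format is now flagged as a non-resumable rtmp download and the play path is carved out of the URL. In isolation, using a URL shaped like the one the old replace() targeted:

rtmp_url = 'rtmp://vod-bonanza.gss.dr.dk/bonanza/mp4:bonanza/SomeShow.mp4'  # illustrative
fmt = {'url': rtmp_url}
if fmt['url'].startswith('rtmp'):
    fmt['rtmp_live'] = True  # --resume does not work
    if '/bonanza/' in rtmp_url:
        fmt['play_path'] = rtmp_url.split('/bonanza/')[1]
print(fmt['play_path'])  # mp4:bonanza/SomeShow.mp4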
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 3c39ca451..cebdd0193 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor

 class FazIE(InfoExtractor):
     IE_NAME = 'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'

-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
         'info_dict': {
             'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
             'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
             'description': 'md5:1453fbf9a0d041d985a47306192ea253',
         },
-    }
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index edf555b29..b2c984bf2 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -18,6 +18,7 @@ from ..utils import (
     parse_duration,
     determine_ext,
 )
+from .dailymotion import DailymotionCloudIE


 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -60,7 +61,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
                     continue
                 video_url_parsed = compat_urllib_parse_urlparse(video_url)
                 f4m_url = self._download_webpage(
-                    'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+                    'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path,
                     video_id, 'Downloading f4m manifest token', fatal=False)
                 if f4m_url:
                     formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))
@@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': 'HLS (reqires ffmpeg)'
         },
         'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+        'info_dict': {
+            'id': '556e03339473995ee145930c',
+            'ext': 'mp4',
+            'title': 'Les entreprises familiales : le secret de la réussite',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+        }
     }]

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
+
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
         video_id, catalogue = self._search_regex(
             r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
         return self._extract_video(video_id, catalogue)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index c8582bda9..7769ffc5c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -42,6 +42,10 @@ from .udn import UDNEmbedIE
 from .senateisvp import SenateISVPIE
 from .bliptv import BlipTVIE
 from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE


 class GenericIE(InfoExtractor):
@@ -332,6 +336,15 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+        },
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -811,6 +824,29 @@
                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
                 'uploader': 'Rogers Sportsnet',
             },
+        },
+        # Dailymotion Cloud video
+        {
+            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+            'md5': '49444254273501a64675a7e68c502681',
+            'info_dict': {
+                'id': '5585de919473990de4bee11b',
+                'ext': 'mp4',
+                'title': 'Le débat',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
         }
     ]
@@ -1090,18 +1126,9 @@
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')

-        # Look for embedded (iframe) Vimeo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
-        if mobj:
-            player_url = unescapeHTML(mobj.group('url'))
-            surl = smuggle_url(player_url, {'Referer': url})
-            return self.url_result(surl)
-        # Look for embedded (swf embed) Vimeo player
-        mobj = re.search(
-            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1))
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url)

         # Look for embedded YouTube player
         matches = re.findall(r'''(?x)
@@ -1323,6 +1350,16 @@
         if sportbox_urls:
             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')

+        # Look for embedded PornHub player
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
         # Look for embedded Tvigle player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1490,6 +1527,20 @@
         if senate_isvp_url:
             return self.url_result(senate_isvp_url, 'SenateISVP')

+        # Look for Dailymotion Cloud videos
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
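Note: the new AdobeTVVideo branch in the generic extractor has to handle scheme-relative iframe src values, hence the `(?:https?:)?` prefix and the `_proto_relative_url` call. A toy version of just that lookup (the markup is invented):

import re

page = '<iframe src="//video.tv.adobe.com/v/2456/"></iframe>'

m = re.search(
    r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
    page)
url = m.group(1)
if url.startswith('//'):
    url = 'https:' + url  # roughly what _proto_relative_url does with a scheme-relative URL
print(url)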
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f29df36b5..4bb574cf3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
             format_info = info['videoPlayerObject']['video']
             formats.append({
                 'format_id': f_id,
-                'url': format_info['url'],
+                'url': format_info['videoInfoList'][0]['videoUrl'],
             })

         return {
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 42cb6e35f..f8cbca7b3 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -8,6 +8,7 @@ from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
     int_or_none,
+    remove_end,
     unified_strdate,
     ExtractorError,
 )
@@ -39,7 +40,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
             'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
             'upload_date': '20150402',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/news/153461',
@@ -50,7 +50,6 @@
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
             'upload_date': '20150505',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/video/13035',
@@ -72,20 +71,20 @@
         if not videos and not iframe_link:
             raise ExtractorError('No media links available for %s' % video_id)

-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Первый по срочным новостям — LIFE | NEWS')

         description = self._og_search_description(webpage)

         view_count = self._html_search_regex(
             r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>\s*<span class=\'counter\'>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)
+            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+            webpage, 'comment count', fatal=False)

         upload_date = self._html_search_regex(
-            r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
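Note: remove_end is a small helper from youtube_dl/utils.py; the title cleanup above is equivalent to this simplified re-statement:

def remove_end(s, end):  # simplified copy of the utils helper
    return s[:-len(end)] if s is not None and s.endswith(end) else s

print(remove_end(
    'Заголовок - Первый по срочным новостям — LIFE | NEWS',
    ' - Первый по срочным новостям — LIFE | NEWS'))  # 'Заголовок'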
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 35822067f..857edfde2 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
             'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
             'age_limit': 18,
         }
+    }, {
+        # Covers https://github.com/rg3/youtube-dl/pull/5983
+        'url': 'http://www.liveleak.com/view?i=801_1409392012',
+        'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+        'info_dict': {
+            'id': '801_1409392012',
+            'ext': 'mp4',
+            'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+            'uploader': 'bony333',
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+        }
     }]

     def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
             'url': s['file'],
         } for i, s in enumerate(sources)]
         for i, s in enumerate(sources):
-            orig_url = s['file'].replace('.h264_base.mp4', '')
+            # Removing '.h264_*.mp4' gives the raw video, which is essentially
+            # the same video without the LiveLeak logo at the top (see
+            # https://github.com/rg3/youtube-dl/pull/4768)
+            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
            if s['file'] != orig_url:
                formats.append({
                    'format_id': 'original-%s' % i,
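Note: the old code only stripped the literal '.h264_base.mp4' suffix, so variants like '.h264_720p.mp4' never yielded an 'original' format. The regex covers them all; for example (made-up URL in the site's style):

import re

src = 'http://edge.liveleak.com/.../media.mp4.h264_720p.mp4'
orig_url = re.sub(r'\.h264_.+?\.mp4', '', src)
print(orig_url)  # http://edge.liveleak.com/.../media.mp4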
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 3cecebf95..0f8aa5ada 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -182,7 +182,6 @@ class NiconicoIE(InfoExtractor):
         extension = xpath_text(video_info, './/movie_type')
         if not extension:
             extension = determine_ext(video_real_url)
-        video_format = extension.upper()

         thumbnail = (
             xpath_text(video_info, './/thumbnail_url') or
@@ -241,7 +240,7 @@
             'url': video_real_url,
             'title': title,
             'ext': extension,
-            'format': video_format,
+            'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
             'thumbnail': thumbnail,
             'description': description,
             'uploader': uploader,
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 5bbd2dcf6..a53e27b27 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -195,7 +195,7 @@ class NocoIE(InfoExtractor):
         if episode_number:
             title += ' #' + compat_str(episode_number)
         if episode:
-            title += ' - ' + episode
+            title += ' - ' + compat_str(episode)

         description = show.get('show_resume') or show.get('family_resume')
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644
index 000000000..a52210fab
--- /dev/null
+++ b/youtube_dl/extractor/pinkbike.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        formats = []
+        for _, format_id, src in re.findall(
+                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None) or remove_start(
+            self._og_search_description(webpage), title + '. ')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
+        def extract_count(webpage, label):
+            return str_to_int(self._search_regex(
+                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+                webpage, label, fatal=False))
+
+        view_count = extract_count(webpage, 'Views')
+        comment_count = extract_count(webpage, 'Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index daa284ea2..8172bc997 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -19,8 +19,8 @@ from ..aes import (
 )


 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '882f488fa1f0026f023f33576004a2ed',
         'info_dict': {
@@ -30,7 +30,17 @@ class PornHubIE(InfoExtractor):
             "title": "Seductive Indian beauty strips down and fingers her pink pussy",
             "age_limit": 18
         }
-    }
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+        if mobj:
+            return mobj.group('url')

     def _extract_count(self, pattern, webpage, name):
         return str_to_int(self._search_regex(
@@ -39,7 +49,8 @@
     def _real_extract(self, url):
         video_id = self._match_id(url)

-        req = compat_urllib_request.Request(url)
+        req = compat_urllib_request.Request(
+            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
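Note: PornHubIE._extract_url is the embed hook reused by the generic and Tumblr extractors elsewhere in this changeset. Standalone (the markup is invented):

import re

page = '<iframe src="//www.pornhub.com/embed/1234567890" frameborder="0"></iframe>'

m = re.search(
    r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1',
    page)
print(m.group('url') if m else None)  # //www.pornhub.com/embed/1234567890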
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 255d4abc1..536a42dc8 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -177,6 +177,7 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +207,8 @@
     def _extract_clip(self, url, webpage):
         clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')

-        access_token = 'testclient'
-        client_name = 'kolibri-1.2.5'
+        access_token = 'prosieben'
+        client_name = 'kolibri-1.12.6'
         client_location = url

         videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -275,13 +276,17 @@
         for source in urls_sources:
             protocol = source['protocol']
             if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url'])
                 if not mobj:
                     continue
+                path = mobj.group('path')
+                mp4colon_index = path.rfind('mp4:')
+                app = path[:mp4colon_index]
+                play_path = path[mp4colon_index:]
                 formats.append({
-                    'url': mobj.group('url'),
-                    'app': mobj.group('app'),
-                    'play_path': mobj.group('playpath'),
+                    'url': '%s/%s' % (mobj.group('url'), app),
+                    'app': app,
+                    'play_path': play_path,
                     'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                     'page_url': 'http://www.prosieben.de',
                     'vbr': fix_bitrate(source['bitrate']),
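Note: the rtmp URLs here can carry several path segments before the 'mp4:...' play path, which the old single-segment `(?P<app>[^/]+)` group could not represent. Splitting at the last 'mp4:' marker instead, on an invented URL of that shape:

import re

source_url = 'rtmpe://example.fplive.net/example/nested/dirs/mp4:videos/clip.mp4'

m = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
path = m.group('path')
mp4colon_index = path.rfind('mp4:')
app = path[:mp4colon_index]
play_path = path[mp4colon_index:]
print(app)        # example/nested/dirs/
print(play_path)  # mp4:videos/clip.mp4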
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 10251f29e..f3c80708c 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE):
                             library/view/[^/]+|
                             api/v1/book
                         )/
-                        (?P<course_id>\d+)/
+                        (?P<course_id>[^/]+)/
                         (?:chapter(?:-content)?/)?
                         (?P<part>part\d+)\.html
     '''
@@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE):
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
@@ -122,7 +126,7 @@ class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'

-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'

     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 29bd9ce6f..ba2d5e19b 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -6,9 +6,12 @@ import re

 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urllib_request
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
 )
-from ..utils import ExtractorError


 class SohuIE(InfoExtractor):
@@ -26,7 +29,7 @@
         'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
-        'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
+        'md5': '699060e75cf58858dd47fb9c03c42cfb',
         'info_dict': {
             'id': '409385080',
             'ext': 'mp4',
         }
     }, {
         'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
-        'md5': '49308ff6dafde5ece51137d04aec311e',
+        'md5': '9bf34be48f2f4dadcb226c74127e203c',
         'info_dict': {
             'id': '78693464',
             'ext': 'mp4',
             'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
         },
         'playlist': [{
-            'md5': '492923eac023ba2f13ff69617c32754a',
+            'md5': 'bdbfb8f39924725e6589c146bc1883ad',
             'info_dict': {
                 'id': '78910339_part1',
                 'ext': 'mp4',
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': 'de604848c0e8e9c4a4dde7e1347c0637',
+            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
             'info_dict': {
                 'id': '78910339_part2',
                 'ext': 'mp4',
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': '93584716ee0657c0b205b8aa3d27aa13',
+            'md5': '8407e634175fdac706766481b9443450',
             'info_dict': {
                 'id': '78910339_part3',
                 'ext': 'mp4',
@@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):
         for i in range(part_count):
             formats = []
             for format_id, format_data in formats_json.items():
+                allot = format_data['allot']
+
                 data = format_data['data']
+                clips_url = data['clipsURL']
+                su = data['su']

-                # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable
-                # so retry until got a working URL
                 video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
                 retries = 0
-                while 'newflv.sohu.ccgslb.net' in video_url and retries < 5:
-                    download_note = 'Download information from CDN gateway for format ' + format_id
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                    }
+
+                    if cdnId is not None:
+                        params['idc'] = cdnId
+
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
                     if retries > 0:
                         download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+                        video_id, download_note), video_id)
+
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
+
                     retries += 1
-                    cdn_info = self._download_json(
-                        'http://data.vod.itc.cn/cdnList?new=' + data['su'][i],
-                        video_id, download_note)
-                    video_url = cdn_info['url']
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')

                 formats.append({
                     'url': video_url,
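Note: the rewritten Sohu loop queries the allotted CDN host directly and feeds the returned node id back as the 'idc' parameter until the host stops handing out the unusable newflv.sohu.ccgslb.net URL. The control flow in isolation, with the JSON fetch stubbed out by invented responses:

responses = iter([
    {'url': 'http://newflv.sohu.ccgslb.net/bad.mp4', 'nid': 'cdn-1'},
    {'url': 'http://good-cdn.example.com/part1.mp4', 'nid': 'cdn-2'},
])

video_url = 'newflv.sohu.ccgslb.net'
cdn_id = None
retries = 0
while 'newflv.sohu.ccgslb.net' in video_url:
    params = {'prot': 9, 'prod': 'flash'}
    if cdn_id is not None:
        params['idc'] = cdn_id
    part_info = next(responses)  # stand-in for the gateway request
    video_url = part_info['url']
    cdn_id = part_info.get('nid')
    retries += 1
    if retries > 5:
        raise RuntimeError('Failed to get video URL')
print(video_url)  # http://good-cdn.example.com/part1.mp4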
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 06d6e6640..bff75d6b2 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -27,7 +27,7 @@ class SpankwireIE(InfoExtractor):
             'description': 'Crazy Bitch X rated music video.',
             'uploader': 'oreusz',
             'uploader_id': '124697',
-            'upload_date': '20070508',
+            'upload_date': '20070507',
             'age_limit': 18,
         }
     }
@@ -44,7 +44,7 @@
         title = self._html_search_regex(
             r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'<div\s+id="descriptionContent">([^<]+)<',
+            r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
             webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
             r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -64,12 +64,12 @@
             r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
             webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+            r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
             webpage, 'comment count', fatal=False))

         video_urls = list(map(
             compat_urllib_parse.unquote,
-            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
             password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index e6218808f..9ead13a91 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,6 +4,8 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from .pornhub import PornHubIE
+from .vimeo import VimeoIE


 class TumblrIE(InfoExtractor):
@@ -39,6 +41,17 @@ class TumblrIE(InfoExtractor):
             'timestamp': 1430931613,
         },
         'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
     }]

     def _real_extract(self, url):
@@ -55,6 +68,14 @@
         if vid_me_embed_url is not None:
             return self.url_result(vid_me_embed_url, 'Vidme')

+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url:
+            return self.url_result(vimeo_url, 'Vimeo')
+
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
             webpage, 'iframe url')
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
index 6b5d80aee..3a4f393fc 100644
--- a/youtube_dl/extractor/tvc.py
+++ b/youtube_dl/extractor/tvc.py
@@ -27,7 +27,7 @@ class TVCIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http://)?(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
         if mobj:
             return mobj.group('url')
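Note: the TVC fix is about scheme-relative embeds: with the old `(?:http://)?` prefix a src of '//www.tvc.ru/...' could never match, because nothing consumed the leading '//'. The new pattern does (iframe id invented):

import re

TVC_RE = r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1'
page = '<iframe src="//www.tvc.ru/video/iframe/id/74622"></iframe>'
m = re.search(TVC_RE, page)
print(m.group('url') if m else None)  # //www.tvc.ru/video/iframe/id/74622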
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index e83e31a31..79863e781 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
             viasat4play\.no/programmer|
             tv6play\.no/programmer|
             tv3play\.dk/programmer|
+            play\.novatv\.bg/programi
         )/[^/]+/(?P<id>\d+)
         '''
     _TESTS = [
@@ -173,6 +174,22 @@
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+            'info_dict': {
+                'id': '624952',
+                'ext': 'flv',
+                'title': 'Здравей, България (12.06.2015 г.) ',
+                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+                'duration': 8838,
+                'timestamp': 1434100372,
+                'upload_date': '20150612',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
     ]

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index dd026748d..722eb5236 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)

-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';',
-                                          redirect_page, 'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id,
+        # need to get the page 3 times for the correct jsSecretToken cookie
+        # which is necessary for the correct title
+        def get_session_id():
+            redirect_page = self._download_webpage(url, video_id)
+            session_id_url = self._search_regex(
+                r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+                'session id url')
+            self._download_webpage(
+                compat_urlparse.urljoin(url, session_id_url), video_id,
+                'Getting session id')
+
+        get_session_id()
+        get_session_id()
+
+        webpage = self._download_webpage(url, video_id,
                                          'Downloading redirect page')

         title = self._html_search_regex(r'<title>(.*)</title>',
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 7f2fb1ca8..51cdc6b65 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,5 +1,7 @@
+# coding: utf-8
 from __future__ import unicode_literals

+import json
 import time
 import hmac
 import hashlib
@@ -11,6 +13,7 @@ from ..utils import (
     parse_age_limit,
     parse_iso8601,
 )
+from ..compat import compat_urllib_request
 from .common import InfoExtractor


@@ -23,27 +26,35 @@ class VikiBaseIE(InfoExtractor):
     _APP_VERSION = '2.2.5.1428709186'
     _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'

-    def _prepare_call(self, path, timestamp=None):
+    _NETRC_MACHINE = 'viki'
+
+    _token = None
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
         path += '?' if '?' not in path else '&'
         if not timestamp:
             timestamp = int(time.time())
         query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
         sig = hmac.new(
             self._APP_SECRET.encode('ascii'),
             query.encode('ascii'),
             hashlib.sha1
         ).hexdigest()
-        return self._API_URL_TEMPLATE % (query, sig)
+        url = self._API_URL_TEMPLATE % (query, sig)
+        return compat_urllib_request.Request(
+            url, json.dumps(post_data).encode('utf-8')) if post_data else url

-    def _call_api(self, path, video_id, note, timestamp=None):
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
         resp = self._download_json(
-            self._prepare_call(path, timestamp), video_id, note)
+            self._prepare_call(path, timestamp, post_data), video_id, note)

         error = resp.get('error')
         if error:
             if error == 'invalid timestamp':
                 resp = self._download_json(
-                    self._prepare_call(path, int(resp['current_timestamp'])),
+                    self._prepare_call(path, int(resp['current_timestamp']), post_data),
                     video_id, '%s (retry)' % note)
                 error = resp.get('error')
             if error:
@@ -56,6 +67,27 @@
             '%s returned error: %s' % (self.IE_NAME, error),
             expected=True)

+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'login_id': username,
+            'password': password,
+        }
+
+        login = self._call_api(
+            'sessions.json', None,
+            'Logging in as %s' % username, post_data=login_form)
+
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
+

 class VikiIE(VikiBaseIE):
     IE_NAME = 'viki'
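Note: every Viki API call is signed with HMAC-SHA1 over the query string, and after this change the query also carries the session token when a login succeeded. The signing itself is plain stdlib; sketched here with a placeholder app id and secret, and with the query layout shown schematically (the real templates live outside these hunks):

import hashlib
import hmac
import time

APP_SECRET = b'placeholder-secret'  # placeholder, not the real _APP_SECRET

path = 'videos/1023585.json'
path += '?' if '?' not in path else '&'
query = '/v4/%sapp=100000a&t=%d' % (path, int(time.time()))  # schematic _API_QUERY_TEMPLATE
sig = hmac.new(APP_SECRET, query.encode('ascii'), hashlib.sha1).hexdigest()
print('http://api.viki.io%s&sig=%s' % (query, sig))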
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index f300c7ca4..cae90205d 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -22,6 +22,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     urlencode_postdata,
+    unescapeHTML,
 )


@@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
         },
     ]

+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        # Look for embedded (iframe) Vimeo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group('url'))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return surl
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index cc384adbf..38ff3c1a9 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -13,6 +13,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     orderedSet,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
 )
@@ -34,6 +35,7 @@ class VKIE(InfoExtractor):
                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
                 'duration': 195,
                 'upload_date': '20120212',
+                'view_count': int,
             },
         },
         {
@@ -45,7 +47,8 @@
                 'uploader': 'Tom Cruise',
                 'title': 'No name',
                 'duration': 9,
-                'upload_date': '20130721'
+                'upload_date': '20130721',
+                'view_count': int,
             }
         },
         {
@@ -59,6 +62,7 @@
                 'title': 'Lin Dan',
                 'duration': 101,
                 'upload_date': '20120730',
+                'view_count': int,
             }
         },
         {
@@ -73,7 +77,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Триллеры',
                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
                 'duration': 8352,
-                'upload_date': '20121218'
+                'upload_date': '20121218',
+                'view_count': int,
             },
             'skip': 'Requires vk account credentials',
         },
@@ -100,6 +105,7 @@
                 'title': 'Книга Илая',
                 'duration': 6771,
                 'upload_date': '20140626',
+                'view_count': int,
             },
             'skip': 'Only works from Russia',
         },
@@ -119,8 +125,8 @@
             'act': 'login',
             'role': 'al_frame',
             'expire': '1',
-            'email': username,
-            'pass': password,
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
         }

         request = compat_urllib_request.Request('https://login.vk.com/?act=login',
@@ -175,25 +181,29 @@
                 m_rutube.group(1).replace('\\', ''))
             return self.url_result(rutube_url)

-        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
         if m_opts:
-            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
             if m_opts_url:
                 opts_url = m_opts_url.group(1)
                 if opts_url.startswith('//'):
                     opts_url = 'http:' + opts_url
                 return self.url_result(opts_url)

-        data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+        data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
         data = json.loads(data_json)

         # Extract upload date
         upload_date = None
-        mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
         if mobj is not None:
             mobj.group(1) + ' ' + mobj.group(2)
             upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))

+        view_count = str_to_int(self._search_regex(
+            r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+            info_page, 'view count', fatal=False))
+
         formats = [{
             'format_id': k,
             'url': v,
@@ -210,6 +220,7 @@
             'uploader': data.get('md_author'),
             'duration': data.get('duration'),
             'upload_date': upload_date,
+            'view_count': view_count,
         }
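Note: VK's login endpoint expects the form values in Windows-1251; with plain text values, urlencode would emit UTF-8 percent-escapes and logins with non-ASCII passwords would fail. Pre-encoding the values fixes that (Python 3 spelling of the compat call, made-up credentials):

from urllib.parse import urlencode

username = 'пользователь@example.com'
password = 'пароль'

login_form = {
    'act': 'login',
    'role': 'al_frame',
    'expire': '1',
    'email': username.encode('cp1251'),
    'pass': password.encode('cp1251'),
}
print(urlencode(login_form))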
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4527567f8..b4ad513a0 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -13,7 +13,6 @@ from ..utils import (


 class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
     _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
     _TESTS = [
         {
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
             'age_limit': age_limit,
             'formats': formats,
         }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'upload_date': '20140728',
+            'uploader_id': 'anonymous',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            webpage, 'xhamster url')
+
+        return self.url_result(video_url, 'XHamster')
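The static _extract_urls() above is the hook that lets the generic extractor discover xembed iframes on third-party pages; the generic-extractor wiring itself is not part of this diff. A sketch of how such a hook is consumed:

    import re

    def find_xhamster_embeds(webpage):
        # Same pattern as XHamsterEmbedIE._extract_urls above.
        return [url for _, url in re.findall(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
            webpage)]

    page = '<iframe src="//xhamster.com/xembed.php?video=3328539"></iframe>'
    assert find_xhamster_embeds(page) == ['//xhamster.com/xembed.php?video=3328539']
    # A host extractor would then yield one url_result() per discovered URL.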
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 2a45dc574..d8415bed4 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -5,10 +5,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     ExtractorError,
+    determine_ext,
 )


@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
         }
     }

+    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor):
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False)

+        formats = [{
+            'url': video_url,
+        }]
+
+        android_req = compat_urllib_request.Request(url)
+        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+        if android_webpage is not None:
+            player_params_str = self._search_regex(
+                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+                android_webpage, 'player parameters', default='')
+            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+            if player_params:
+                formats.extend([{
+                    'url': param,
+                    'preference': -10,
+                } for param in player_params if determine_ext(param) == 'mp4'])
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
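The XVideos change re-requests the page with an Android User-Agent because the mobile player (mobileReplacePlayerDivTwoQual) exposes direct MP4 URLs that the desktop page hides; those are added at preference -10 so the default URL still ranks higher. A standalone sketch of the fetch, using plain urllib rather than the extractor plumbing:

    try:
        from urllib.request import Request, urlopen  # Python 3
    except ImportError:
        from urllib2 import Request, urlopen  # Python 2

    ANDROID_UA = ('Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) '
                  'AppleWebKit/535.19 (KHTML, like Gecko) '
                  'Chrome/18.0.1025.133 Mobile Safari/535.19')

    def fetch_as_android(url):
        # Same page, different UA: the server decides which player to ship.
        req = Request(url, headers={'User-Agent': ANDROID_UA})
        return urlopen(req).read().decode('utf-8', 'replace')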
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 97b98bbe8..ced3a10cd 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,123 +1,235 @@
 # coding: utf-8
-
 from __future__ import unicode_literals

-import math
-import random
-import re
-import time
+import base64

 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
+from ..utils import ExtractorError
+
+from ..compat import (
+    compat_urllib_parse,
+    compat_ord,
+    compat_urllib_request,
 )


 class YoukuIE(InfoExtractor):
+    IE_NAME = 'youku'
     _VALID_URL = r'''(?x)
         (?:
             http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
     '''
-    _TEST = {
-        'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
-        'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
-        'params': {
-            'test': False
+
+    _TESTS = [{
+        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+        'md5': '5f3af4192eabacc4501508d54a8cabd7',
+        'info_dict': {
+            'id': 'XMTc1ODE5Njcy_part1',
+            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+            'ext': 'flv'
+        }
+    }, {
+        'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+        'only_matching': True,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+        'info_dict': {
+            'id': 'XODgxNjg1Mzk2',
+            'title': '武媚娘传奇 85',
         },
+        'playlist_count': 11,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
         'info_dict': {
-            'id': 'XNDgyMDQ2NTQw_part00',
-            'ext': 'flv',
-            'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+            'id': 'XMTI1OTczNDM5Mg',
+            'title': '花千骨 04',
+        },
+        'playlist_count': 13,
+        'skip': 'Available in China only',
+    }]
+
+    def construct_video_urls(self, data1, data2):
+        # get sid, token
+        def yk_t(s1, s2):
+            ls = list(range(256))
+            t = 0
+            for i in range(256):
+                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+                ls[i], ls[t] = ls[t], ls[i]
+            s = bytearray()
+            x, y = 0, 0
+            for i in range(len(s2)):
+                y = (y + 1) % 256
+                x = (x + ls[y]) % 256
+                ls[x], ls[y] = ls[y], ls[x]
+                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+            return bytes(s)
+
+        sid, token = yk_t(
+            b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+        ).decode('ascii').split('_')
+
+        # get oip
+        oip = data2['ip']
+
+        # get fileid
+        string_ls = list(
+            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+        shuffled_string_ls = []
+        seed = data1['seed']
+        N = len(string_ls)
+        for ii in range(N):
+            seed = (seed * 0xd3 + 0x754f) % 0x10000
+            idx = seed * len(string_ls) // 0x10000
+            shuffled_string_ls.append(string_ls[idx])
+            del string_ls[idx]
+
+        fileid_dict = {}
+        for format in data1['streamtypes']:
+            streamfileid = [
+                int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+            fileid = ''.join(
+                [shuffled_string_ls[i] for i in streamfileid])
+            fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+        def get_fileid(format, n):
+            fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+            return fileid
+
+        # get ep
+        def generate_ep(format, n):
+            fileid = get_fileid(format, n)
+            ep_t = yk_t(
+                b'bf7e5f01',
+                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+            )
+            ep = base64.b64encode(ep_t).decode('ascii')
+            return ep
+
+        # generate video_urls
+        video_urls_dict = {}
+        for format in data1['streamtypes']:
+            video_urls = []
+            for dt in data1['segs'][format]:
+                n = str(int(dt['no']))
+                param = {
+                    'K': dt['k'],
+                    'hd': self.get_hd(format),
+                    'myp': 0,
+                    'ts': dt['seconds'],
+                    'ypp': 0,
+                    'ctype': 12,
+                    'ev': 1,
+                    'token': token,
+                    'oip': oip,
+                    'ep': generate_ep(format, n)
+                }
+                video_url = \
+                    'http://k.youku.com/player/getFlvPath/' + \
+                    'sid/' + sid + \
+                    '_' + str(int(n) + 1).zfill(2) + \
+                    '/st/' + self.parse_ext_l(format) + \
+                    '/fileid/' + get_fileid(format, n) + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                video_urls.append(video_url)
+            video_urls_dict[format] = video_urls
+
+        return video_urls_dict
+
+    def get_hd(self, fm):
+        hd_id_dict = {
+            'flv': '0',
+            'mp4': '1',
+            'hd2': '2',
+            'hd3': '3',
+            '3gp': '0',
+            '3gphd': '1'
         }
-    }
-
-    def _gen_sid(self):
-        nowTime = int(time.time() * 1000)
-        random1 = random.randint(1000, 1998)
-        random2 = random.randint(1000, 9999)
-
-        return "%d%d%d" % (nowTime, random1, random2)
-
-    def _get_file_ID_mix_string(self, seed):
-        mixed = []
-        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
-        seed = float(seed)
-        for i in range(len(source)):
-            seed = (seed * 211 + 30031) % 65536
-            index = math.floor(seed / 65536 * len(source))
-            mixed.append(source[int(index)])
-            source.remove(source[int(index)])
-        # return ''.join(mixed)
-        return mixed
-
-    def _get_file_id(self, fileId, seed):
-        mixed = self._get_file_ID_mix_string(seed)
-        ids = fileId.split('*')
-        realId = []
-        for ch in ids:
-            if ch:
-                realId.append(mixed[int(ch)])
-        return ''.join(realId)
+        return hd_id_dict[fm]
+
+    def parse_ext_l(self, fm):
+        ext_dict = {
+            'flv': 'flv',
+            'mp4': 'mp4',
+            'hd2': 'flv',
+            'hd3': 'flv',
+            '3gp': 'flv',
+            '3gphd': 'mp4'
+        }
+        return ext_dict[fm]
+
+    def get_format_name(self, fm):
+        _dict = {
+            '3gp': 'h6',
+            '3gphd': 'h5',
+            'flv': 'h4',
+            'mp4': 'h3',
+            'hd2': 'h2',
+            'hd3': 'h1'
+        }
+        return _dict[fm]

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+        video_id = self._match_id(url)

-        config = self._download_json(info_url, video_id)
+        def retrieve_data(req_url, note):
+            req = compat_urllib_request.Request(req_url)

-        error_code = config['data'][0].get('error_code')
-        if error_code:
-            # -8 means blocked outside China.
-            error = config['data'][0].get('error')  # Chinese and English, separated by newline.
-            raise ExtractorError(error or 'Server reported error %i' % error_code,
-                expected=True)
+            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+            if cn_verification_proxy:
+                req.add_header('Ytdl-request-proxy', cn_verification_proxy)

-        video_title = config['data'][0]['title']
-        seed = config['data'][0]['seed']
+            raw_data = self._download_json(req, video_id, note=note)
+            return raw_data['data'][0]

-        format = self._downloader.params.get('format', None)
-        supported_format = list(config['data'][0]['streamfileids'].keys())
+        # request basic data
+        data1 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
+            'Downloading JSON metadata 1')
+        data2 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+            'Downloading JSON metadata 2')

-        # TODO proper format selection
-        if format is None or format == 'best':
-            if 'hd2' in supported_format:
-                format = 'hd2'
+        error_code = data1.get('error_code')
+        if error_code:
+            error = data1.get('error')
+            if error is not None and '因版权原因无法观看此视频' in error:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is available in China only', expected=True)
             else:
-                format = 'flv'
-            ext = 'flv'
-        elif format == 'worst':
-            format = 'mp4'
-            ext = 'mp4'
-        else:
-            format = 'flv'
-            ext = 'flv'
-
-        fileid = config['data'][0]['streamfileids'][format]
-        keys = [s['k'] for s in config['data'][0]['segs'][format]]
-        # segs is usually a dictionary, but an empty *list* if an error occured.
-
-        files_info = []
-        sid = self._gen_sid()
-        fileid = self._get_file_id(fileid, seed)
-
-        # column 8,9 of fileid represent the segment number
-        # fileid[7:9] should be changed
-        for index, key in enumerate(keys):
-            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
-            download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
-            info = {
-                'id': '%s_part%02d' % (video_id, index),
-                'url': download_url,
-                'uploader': None,
-                'upload_date': None,
-                'title': video_title,
-                'ext': ext,
-            }
-            files_info.append(info)
-
-        return files_info
+                msg = 'Youku server reported error %i' % error_code
+                if error is not None:
+                    msg += ': ' + error
+                raise ExtractorError(msg)
+
+        title = data1['title']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(data1, data2)
+
+        # construct info
+        entries = [{
+            'id': '%s_part%d' % (video_id, i + 1),
+            'title': title,
+            'formats': [],
+            # some formats are not available for all parts, we have to detect
+            # which one has all
+        } for i in range(max(len(v) for v in data1['segs'].values()))]
+        for fm in data1['streamtypes']:
+            video_urls = video_urls_dict[fm]
+            for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+                entry['formats'].append({
+                    'url': video_url,
+                    'format_id': self.get_format_name(fm),
+                    'ext': self.parse_ext_l(fm),
+                    'filesize': int(seg['size']),
+                })
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'title': title,
+            'entries': entries,
+        }
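The yk_t() helper in construct_video_urls() above is a bare-bones RC4: the first loop is the key schedule, the second generates the keystream and XORs it over the input. RC4 is its own inverse, which makes the helper easy to sanity-check in isolation (Python 3 sketch; compat_ord is only needed on Python 2, where indexing bytes yields str):

    def rc4(key, data):
        # Key-scheduling algorithm (KSA)
        ls = list(range(256))
        t = 0
        for i in range(256):
            t = (t + ls[i] + key[i % len(key)]) % 256
            ls[i], ls[t] = ls[t], ls[i]
        # Pseudo-random generation + XOR (PRGA)
        out = bytearray()
        x = y = 0
        for ch in data:
            y = (y + 1) % 256
            x = (x + ls[y]) % 256
            ls[x], ls[y] = ls[y], ls[x]
            out.append(ch ^ ls[(ls[x] + ls[y]) % 256])
        return bytes(out)

    msg = b'sid_token'
    assert rc4(b'becaf9be', rc4(b'becaf9be', msg)) == msg  # encrypt == decrypt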
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 3448bec4f..a3da56c14 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '44': {'ext': 'webm', 'width': 854, 'height': 480},
         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},


         # 3d videos
@@ -1504,7 +1506,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):

         for pagenum in itertools.count(1):
             url_query = {
-                'search_query': query,
+                'search_query': query.encode('utf-8'),
                 'page': pagenum,
                 'spf': 'navigate',
             }
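The search_query change is a Python 2 fix: there, urllib's urlencode() calls str() on unicode values and raises UnicodeEncodeError for non-ASCII queries, so the query is pre-encoded to UTF-8 bytes. A sketch of the failure mode:

    import sys

    query = u'youtube-dl \u6e2c\u8a66'  # non-ASCII search term
    if sys.version_info[0] == 2:
        from urllib import urlencode
        data = urlencode({'search_query': query.encode('utf-8')})
        # without .encode('utf-8') this raises UnicodeEncodeError
    else:
        from urllib.parse import urlencode
        data = urlencode({'search_query': query})  # Python 3 encodes for you
    print(data)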
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 096ab6137..6aeca61ee 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -729,7 +729,7 @@ def parseOpts(overrideArguments=None):
         metavar='POLICY', dest='fixup', default='detect_or_warn',
         help='Automatically correct known faults of the file. '
              'One of never (do nothing), warn (only emit a warning), '
-             'detect_or_warn(the default; fix file if we can, warn otherwise)')
+             'detect_or_warn (the default; fix file if we can, warn otherwise)')
     postproc.add_option(
         '--prefer-avconv',
         action='store_false', dest='prefer_ffmpeg',
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 774494efd..e19dbf73d 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -35,6 +35,11 @@ class EmbedThumbnailPP(FFmpegPostProcessor):

         thumbnail_filename = info['thumbnails'][-1]['filename']

+        if not os.path.exists(encodeFilename(thumbnail_filename)):
+            self._downloader.report_warning(
+                'Skipping embedding the thumbnail because the file is missing.')
+            return [], info
+
         if info['ext'] == 'mp3':
             options = [
                 '-c', 'copy', '-map', '0', '-map', '1',
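The new guard makes EmbedThumbnailPP degrade gracefully instead of crashing ffmpeg on a missing file. The `return [], info` shape follows the postprocessor contract (a list of files that may be deleted afterwards, plus the possibly-updated info dict); a schematic sketch, with the actual embedding step stubbed out:

    import os

    def embed_thumbnail(info, report_warning):
        # Schematic stand-in for EmbedThumbnailPP.run(); the real method
        # shells out to ffmpeg/AtomicParsley instead of the comment below.
        thumbnail = info['thumbnails'][-1]['filename']
        if not os.path.exists(thumbnail):
            report_warning(
                'Skipping embedding the thumbnail because the file is missing.')
            return [], info  # nothing to delete, info unchanged
        # ... embed the image here, then report what may be cleaned up:
        return [thumbnail], info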
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index cc65b34e7..fe7e0a8ee 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -21,6 +21,7 @@ from ..utils import (
     shell_quote,
     subtitles_filename,
     dfxp2srt,
+    ISO639Utils,
 )


@@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):


 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
-    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
-    _lang_map = {
-        'aa': 'aar',
-        'ab': 'abk',
-        'ae': 'ave',
-        'af': 'afr',
-        'ak': 'aka',
-        'am': 'amh',
-        'an': 'arg',
-        'ar': 'ara',
-        'as': 'asm',
-        'av': 'ava',
-        'ay': 'aym',
-        'az': 'aze',
-        'ba': 'bak',
-        'be': 'bel',
-        'bg': 'bul',
-        'bh': 'bih',
-        'bi': 'bis',
-        'bm': 'bam',
-        'bn': 'ben',
-        'bo': 'bod',
-        'br': 'bre',
-        'bs': 'bos',
-        'ca': 'cat',
-        'ce': 'che',
-        'ch': 'cha',
-        'co': 'cos',
-        'cr': 'cre',
-        'cs': 'ces',
-        'cu': 'chu',
-        'cv': 'chv',
-        'cy': 'cym',
-        'da': 'dan',
-        'de': 'deu',
-        'dv': 'div',
-        'dz': 'dzo',
-        'ee': 'ewe',
-        'el': 'ell',
-        'en': 'eng',
-        'eo': 'epo',
-        'es': 'spa',
-        'et': 'est',
-        'eu': 'eus',
-        'fa': 'fas',
-        'ff': 'ful',
-        'fi': 'fin',
-        'fj': 'fij',
-        'fo': 'fao',
-        'fr': 'fra',
-        'fy': 'fry',
-        'ga': 'gle',
-        'gd': 'gla',
-        'gl': 'glg',
-        'gn': 'grn',
-        'gu': 'guj',
-        'gv': 'glv',
-        'ha': 'hau',
-        'he': 'heb',
-        'hi': 'hin',
-        'ho': 'hmo',
-        'hr': 'hrv',
-        'ht': 'hat',
-        'hu': 'hun',
-        'hy': 'hye',
-        'hz': 'her',
-        'ia': 'ina',
-        'id': 'ind',
-        'ie': 'ile',
-        'ig': 'ibo',
-        'ii': 'iii',
-        'ik': 'ipk',
-        'io': 'ido',
-        'is': 'isl',
-        'it': 'ita',
-        'iu': 'iku',
-        'ja': 'jpn',
-        'jv': 'jav',
-        'ka': 'kat',
-        'kg': 'kon',
-        'ki': 'kik',
-        'kj': 'kua',
-        'kk': 'kaz',
-        'kl': 'kal',
-        'km': 'khm',
-        'kn': 'kan',
-        'ko': 'kor',
-        'kr': 'kau',
-        'ks': 'kas',
-        'ku': 'kur',
-        'kv': 'kom',
-        'kw': 'cor',
-        'ky': 'kir',
-        'la': 'lat',
-        'lb': 'ltz',
-        'lg': 'lug',
-        'li': 'lim',
-        'ln': 'lin',
-        'lo': 'lao',
-        'lt': 'lit',
-        'lu': 'lub',
-        'lv': 'lav',
-        'mg': 'mlg',
-        'mh': 'mah',
-        'mi': 'mri',
-        'mk': 'mkd',
-        'ml': 'mal',
-        'mn': 'mon',
-        'mr': 'mar',
-        'ms': 'msa',
-        'mt': 'mlt',
-        'my': 'mya',
-        'na': 'nau',
-        'nb': 'nob',
-        'nd': 'nde',
-        'ne': 'nep',
-        'ng': 'ndo',
-        'nl': 'nld',
-        'nn': 'nno',
-        'no': 'nor',
-        'nr': 'nbl',
-        'nv': 'nav',
-        'ny': 'nya',
-        'oc': 'oci',
-        'oj': 'oji',
-        'om': 'orm',
-        'or': 'ori',
-        'os': 'oss',
-        'pa': 'pan',
-        'pi': 'pli',
-        'pl': 'pol',
-        'ps': 'pus',
-        'pt': 'por',
-        'qu': 'que',
-        'rm': 'roh',
-        'rn': 'run',
-        'ro': 'ron',
-        'ru': 'rus',
-        'rw': 'kin',
-        'sa': 'san',
-        'sc': 'srd',
-        'sd': 'snd',
-        'se': 'sme',
-        'sg': 'sag',
-        'si': 'sin',
-        'sk': 'slk',
-        'sl': 'slv',
-        'sm': 'smo',
-        'sn': 'sna',
-        'so': 'som',
-        'sq': 'sqi',
-        'sr': 'srp',
-        'ss': 'ssw',
-        'st': 'sot',
-        'su': 'sun',
-        'sv': 'swe',
-        'sw': 'swa',
-        'ta': 'tam',
-        'te': 'tel',
-        'tg': 'tgk',
-        'th': 'tha',
-        'ti': 'tir',
-        'tk': 'tuk',
-        'tl': 'tgl',
-        'tn': 'tsn',
-        'to': 'ton',
-        'tr': 'tur',
-        'ts': 'tso',
-        'tt': 'tat',
-        'tw': 'twi',
-        'ty': 'tah',
-        'ug': 'uig',
-        'uk': 'ukr',
-        'ur': 'urd',
-        'uz': 'uzb',
-        've': 'ven',
-        'vi': 'vie',
-        'vo': 'vol',
-        'wa': 'wln',
-        'wo': 'wol',
-        'xh': 'xho',
-        'yi': 'yid',
-        'yo': 'yor',
-        'za': 'zha',
-        'zh': 'zho',
-        'zu': 'zul',
-    }
-
-    @classmethod
-    def _conver_lang_code(cls, code):
-        """Convert language code from ISO 639-1 to ISO 639-2/T"""
-        return cls._lang_map.get(code[:2])
-
     def run(self, information):
         if information['ext'] not in ['mp4', 'mkv']:
             self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
         opts += ['-c:s', 'mov_text']
         for (i, lang) in enumerate(sub_langs):
             opts.extend(['-map', '%d:0' % (i + 1)])
-            lang_code = self._conver_lang_code(lang)
+            lang_code = ISO639Utils.short2long(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 52d198fa3..a2746b2d1 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds):


 def dfxp2srt(dfxp_data):
-    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+    })

     def parse_node(node):
         str_or_empty = functools.partial(str_or_none, default='')
@@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data):
         out = str_or_empty(node.text)

         for child in node:
-            if child.tag in (_x('ttml:br'), 'br'):
+            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                 out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), 'span'):
+            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                 out += str_or_empty(parse_node(child))
             else:
                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1860,7 +1863,7 @@
     dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

     out = []
-    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

     if not paras:
         raise ValueError('Invalid dfxp/TTML subtitle')
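The utils.py change teaches dfxp2srt() about the pre-standard TTML namespace: some sites still serve subtitles under the 2006 "ttaf1" draft URI, and ElementTree matches on the fully-qualified {uri}tag form, so every lookup needs both spellings (xpath_with_ns is assumed to do exactly that expansion). A sketch of the fallback chain:

    import xml.etree.ElementTree as ET

    TTML = 'http://www.w3.org/ns/ttml'
    TTAF1 = 'http://www.w3.org/2006/10/ttaf1'

    def find_paras(dfxp_data):
        root = ET.fromstring(dfxp_data)
        # Try the final TTML namespace, then the 2006 draft, then no namespace.
        return (root.findall('.//{%s}p' % TTML)
                or root.findall('.//{%s}p' % TTAF1)
                or root.findall('.//p'))

    old_style = b'<tt xmlns="http://www.w3.org/2006/10/ttaf1"><body><div><p>Hi</p></div></body></tt>'
    assert len(find_paras(old_style)) == 1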
@@ -1879,6 +1882,208 @@
     return ''.join(out)


+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 9cf84ff71..34a13cb81 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2015.06.04.1'
+__version__ = '2015.06.15'
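With the table promoted to youtube_dl.utils.ISO639Utils, the ffmpeg postprocessor above and any future caller share one mapping. Usage is straightforward (the lookup keys on code[:2], so region suffixes are tolerated):

    from youtube_dl.utils import ISO639Utils

    assert ISO639Utils.short2long('en') == 'eng'
    assert ISO639Utils.short2long('en-US') == 'eng'  # only the first two chars count
    assert ISO639Utils.long2short('deu') == 'de'
    # Unknown codes fall through to None in both directions.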