diff options
45 files changed, 817 insertions, 593 deletions
@@ -124,3 +124,5 @@ Mohammad Teimori Pabandi Roman Le Négrate Matthias Küch Julian Richen +Ping O. +Mister Hat @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43fbe8b1d..a4879bd9a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,6 +142,7 @@ - **Eporner** - **EroProfile** - **Escapist** + - **ESPN** (Currently broken) - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -338,6 +339,7 @@ - **OktoberfestTV** - **on.aol.com** - **Ooyala** + - **OoyalaExternal** - **OpenFilm** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -451,6 +453,7 @@ - **Spike** - **Sport5** - **SportBox** + - **SportBoxEmbed** - **SportDeutschland** - **Srf** - **SRMediathek**: Saarländischer Rundfunk @@ -510,6 +513,8 @@ - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV2** + - **TV2Article** - **TV4**: tv4.se and tv4play.se - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 58b34e087..d1953c18f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1527,6 +1527,7 @@ class YoutubeDL(object): pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: + files_to_delete = [] try: files_to_delete, info = pp.run(info) except PostProcessingError as e: diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2a5cf9547..f73bf646b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -149,7 +149,6 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -244,6 +243,7 @@ from .kaltura import KalturaIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE @@ -338,8 +338,7 @@ from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE @@ -355,6 +354,7 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -438,7 +438,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE @@ -480,7 +479,6 @@ from .smotri import ( SmotriBroadcastIE, ) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE from .soompi import ( SoompiIE, @@ -651,7 +649,10 @@ from .vine import ( VineIE, VineUserIE, ) -from .viki import VikiIE +from .viki import ( + VikiIE, + VikiChannelIE, +) from .vk import ( VKIE, VKUserVideosIE, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..76de24477 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( find_xpath_attr, unified_strdate, - get_element_by_id, get_element_by_attribute, int_or_none, qualities, @@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 7ca835e31..2103ed73a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import itertools +import json +import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( @@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor): entries = [] - lq_doc = self._download_xml( + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = ET.fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( @@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor): note='Downloading HQ video info', fatal=False, ) - hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None) - - assert len(lq_durls) == len(hq_durls) + if hq_doc is not False: + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) i = 1 for lq_durl, hq_durl in zip(lq_durls, hq_durls): diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index cf0a7551b..c949a4814 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor): 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } } ] @@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor): playerdata_url = self._search_regex( [ - r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index db10b8d00..70aa4333c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') - webpage = self._download_webpage(url, user) + webpage = self._download_webpage( + 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), webpage, 'user')) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..9a5a8f4bb 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'matching_only': True, } - } + ] diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 937b28fcc..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', - } + }, + 'expected_warnings': [ + 'title' + ] }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', - fatal=False) + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', + default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) + webpage, 'alternative title', fatal=False) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) - - -class FiredriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ - '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' - _FILE_DELETED_REGEX = r'<div class="removed_file_image">' - - _TESTS = [{ - 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', - 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', - 'info_dict': { - 'id': 'FEB892FA160EBD01', - 'ext': 'flv', - 'title': 'bbb_theora_486kbit.flv', - 'thumbnail': 're:^http://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://firedrive.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - fields = dict(re.findall(r'''(?x)<input\s+ - type="hidden"\s+ - name="([^"]+)"\s+ - value="([^"]*)" - ''', webpage)) - - post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - # Apparently, this header is required for confirmation to work. - req.add_header('Host', 'www.firedrive.com') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - title = self._search_regex(r'class="external_title_left">(.+)</div>', - webpage, 'title') - thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, - 'thumbnail', fatal=False) - if thumbnail is not None: - thumbnail = 'http:' + thumbnail - - ext = self._search_regex(r'type:\s?\'([^\']+)\',', - webpage, 'extension', fatal=False) - video_url = self._search_regex( - r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': ext, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py new file mode 100644 index 000000000..bed94bc93 --- /dev/null +++ b/youtube_dl/extractor/karrierevideos.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + fix_xml_ampersands, + float_or_none, + xpath_with_ns, + xpath_text, +) + + +class KarriereVideosIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', + 'info_dict': { + 'id': '32c91', + 'ext': 'flv', + 'title': 'AltenpflegerIn', + 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # broken ampersands + 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', + 'info_dict': { + 'id': '5sniu', + 'ext': 'flv', + 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', + 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = (self._html_search_meta('title', webpage, default=None) or + self._search_regex(r'<h1 class="title">([^<]+)</h1>')) + + video_id = self._search_regex( + r'/config/video/(.+?)\.xml', webpage, 'video id') + playlist = self._download_xml( + 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, + video_id, transform_source=fix_xml_ampersands) + + NS_MAP = { + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./tracklist/item') + video_file = xpath_text( + item, ns('./jwplayer:file'), 'video url', fatal=True) + streamer = xpath_text( + item, ns('./jwplayer:streamer'), 'streamer', fatal=True) + + uploader = xpath_text( + item, ns('./jwplayer:author'), 'uploader') + duration = float_or_none( + xpath_text(item, ns('./jwplayer:duration'), 'duration')) + + description = self._html_search_regex( + r'(?s)<div class="leadtext">(.+?)</div>', + webpage, 'description') + + thumbnail = self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + if thumbnail: + thumbnail = compat_urlparse.urljoin(url, thumbnail) + + return { + 'id': video_id, + 'url': streamer.replace('rtmpt', 'rtmp'), + 'play_path': 'mp4:%s' % video_file, + 'ext': 'flv', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '6a75fe9d0d3275bead0cb683c616fddb', 'info_dict': { 'id': '0fce117d', 'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor): 'display_id': 'programa-144', 'duration': 2913, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor): episode, transform_source=strip_jsonp ) + formats = self._extract_m3u8_formats( + token_info['tokenizedUrl'], episode, ext='mp4') return { 'id': embed_data['videoId'], 'display_id': episode, 'title': info_el.find('title').text, - 'url': token_info['tokenizedUrl'], + 'formats': formats, 'description': get_element_by_attribute('class', 'text', webpage), 'thumbnail': info_el.find('thumb').text, 'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c10405f04..925967753 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -16,7 +17,7 @@ from ..utils import ( class NaverIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://tvcast.naver.com/v/81652', 'info_dict': { 'id': '81652', @@ -25,7 +26,18 @@ class NaverIE(InfoExtractor): 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', 'upload_date': '20130903', }, - } + }, { + 'url': 'http://tvcast.naver.com/v/395837', + 'md5': '638ed4c12012c458fefcddfd01f173cd', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', + 'upload_date': '20150519', + }, + 'skip': 'Georestricted', + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +47,7 @@ class NaverIE(InfoExtractor): webpage) if m_id is None: m_error = re.search( - r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>', + r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>', webpage) if m_error: raise ExtractorError(clean_html(m_error.group('msg')), expected=True) @@ -58,14 +70,18 @@ class NaverIE(InfoExtractor): formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): domain = format_el.find('Domain').text + uri = format_el.find('uri').text f = { - 'url': domain + format_el.find('uri').text, + 'url': compat_urlparse.urljoin(domain, uri), 'ext': 'mp4', 'width': int(format_el.find('width').text), 'height': int(format_el.find('height').text), } if domain.startswith('rtmp'): + # urlparse does not support custom schemes + # https://bugs.python.org/issue18828 f.update({ + 'url': domain + uri, 'ext': 'flv', 'rtmp_protocol': '1', # rtmpt }) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, + }, { + 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'info_dict': { + 'id': '0041400301-cle-atl-recap.nba', + 'ext': 'mp4', + 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', + 'duration': 228, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor): self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') description = self._og_search_description(webpage) - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration')) + duration_str = self._html_search_meta( + 'duration', webpage, 'duration', default=None) + if not duration_str: + duration_str = self._html_search_regex( + r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) + duration = parse_duration(duration_str) return { 'id': shortened_video_id, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:b23787119933404ce515c6356a8c355c', + 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', + 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } - }] - - _URL_PATTERN = r'\{url: \'(.+)\'\}' - - def _fetch_title(self, page): - return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - - def _fetch_thumbnail(self, page): - return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - - def _fetch_timestamp(self, page): - return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): - _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' - _TESTS = [{ + }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): 'expected_warnings': [ 'video thumbnail', ] + }, { + 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', + 'only_matching': True, }] + _URL_PATTERN = r'\{url: \'(.+)\'\}' + def _fetch_title(self, page): - return self._html_search_meta('description', page, 'news title') + return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or + self._html_search_meta('description', page, 'news title')) + + def _fetch_thumbnail(self, page): + return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + + def _fetch_timestamp(self, page): + return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py new file mode 100644 index 000000000..173e46cd8 --- /dev/null +++ b/youtube_dl/extractor/nowtv.py @@ -0,0 +1,192 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_duration, + remove_start, +) + + +class NowTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + + _TESTS = [{ + # rtl + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', + 'info_dict': { + 'id': '203519', + 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'ext': 'mp4', + 'title': 'Die neuen Bauern und eine Hochzeit', + 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432580700, + 'upload_date': '20150525', + 'duration': 2786, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtl2 + 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', + 'info_dict': { + 'id': '203481', + 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', + 'ext': 'mp4', + 'title': 'Berlin - Tag & Nacht (Folge 934)', + 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432666800, + 'upload_date': '20150526', + 'duration': 2641, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtlnitro + 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', + 'info_dict': { + 'id': '165780', + 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', + 'ext': 'mp4', + 'title': 'Hals- und Beinbruch', + 'description': 'md5:b50d248efffe244e6f56737f0911ca57', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432415400, + 'upload_date': '20150523', + 'duration': 2742, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # superrtl + 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', + 'info_dict': { + 'id': '99205', + 'display_id': 'medicopter-117/angst', + 'ext': 'mp4', + 'title': 'Angst!', + 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1222632900, + 'upload_date': '20080928', + 'duration': 3025, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # ntv + 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', + 'info_dict': { + 'id': '203521', + 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', + 'ext': 'mp4', + 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', + 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432751700, + 'upload_date': '20150527', + 'duration': 1083, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # vox + 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', + 'info_dict': { + 'id': '128953', + 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', + 'ext': 'mp4', + 'title': "Büro-Fall / Chihuahua 'Joel'", + 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432408200, + 'upload_date': '20150523', + 'duration': 3092, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + station = mobj.group('station') + + info = self._download_json( + 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, + display_id) + + video_id = compat_str(info['id']) + + files = info['files'] + if not files: + if info.get('geoblocked', False): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + + f = info.get('format', {}) + station = f.get('station') or station + + STATIONS = { + 'rtl': 'rtlnow', + 'rtl2': 'rtl2now', + 'vox': 'voxnow', + 'nitro': 'rtlnitronow', + 'ntv': 'n-tvnow', + 'superrtl': 'superrtlnow' + } + + formats = [] + for item in files['items']: + item_path = remove_start(item['path'], '/') + tbr = int_or_none(item['bitrate']) + m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) + m3u8_url = m3u8_url.replace('now/', 'now/videos/') + formats.append({ + 'url': m3u8_url, + 'format_id': '%s-%sk' % (item['id'], tbr), + 'ext': 'mp4', + 'tbr': tbr, + }) + self._sort_formats(formats) + + title = info['title'] + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index fbc521d1a..6c7149fe3 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( unified_strdate, int_or_none, @@ -11,8 +12,9 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)' _TESTS = [{ + # metadata in JSON 'url': 'http://ok.ru/video/20079905452', 'md5': '8e24ad2da6f387948e7a7d44eb8668fe', 'info_dict': { @@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Культура меняет нас (прекрасный ролик!))', 'duration': 100, - 'upload_date': '20141207', 'uploader_id': '330537914540', 'uploader': 'Виталий Добровольский', 'like_count': int, - 'age_limit': 0, + }, + }, { + # metadataUrl + 'url': 'http://ok.ru/video/63567059965189-0', + 'md5': '9676cf86eff5391d35dea675d224e131', + 'info_dict': { + 'id': '63567059965189-0', + 'ext': 'mp4', + 'title': 'Девушка без комплексов ...', + 'duration': 191, + 'uploader_id': '534380003155', + 'uploader': 'Андрей Мещанинов', + 'like_count': int, }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', @@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://ok.ru/video/%s' % video_id, video_id) player = self._parse_json( unescapeHTML(self._search_regex( r'data-attributes="([^"]+)"', webpage, 'player')), video_id) - metadata = self._parse_json(player['flashvars']['metadata'], video_id) + flashvars = player['flashvars'] + + metadata = flashvars.get('metadata') + if metadata: + metadata = self._parse_json(metadata, video_id) + else: + metadata = self._download_json( + compat_urllib_parse.unquote(flashvars['metadataUrl']), + video_id, 'Downloading metadata JSON') movie = metadata['movie'] title = movie['title'] @@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor): uploader = author.get('name') upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date')) + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) age_limit = None adult = self._html_search_meta( - 'ya:ovs:adult', webpage, 'age limit') + 'ya:ovs:adult', webpage, 'age limit', default=None) if adult: age_limit = 18 if adult == 'true' else 0 diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0c8b731cf..daa284ea2 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -71,7 +71,8 @@ class PornHubIE(InfoExtractor): video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password')) + password = compat_urllib_parse.unquote_plus( + self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) formats = [] diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index dce64e151..5a381d9ce 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) class RTBFIE(InfoExtractor): @@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor): 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', 'duration': 3099, - 'timestamp': 1398456336, - 'upload_date': '20140425', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) + webpage = self._download_webpage( + 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) - data = json.loads(self._html_search_regex( - r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] + data = self._parse_json( + unescapeHTML(self._search_regex( + r'data-video="([^"]+)"', webpage, 'data video')), + video_id) video_url = data.get('downloadUrl') or data.get('url') - if data['provider'].lower() == 'youtube': + if data.get('provider').lower() == 'youtube': return self.url_result(video_url, 'Youtube') return { @@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor): 'url': video_url, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), - 'thumbnail': data['thumbnail']['large'], + 'thumbnail': data.get('thumbnail'), 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': data['created'], - 'view_count': data['viewCount'], + 'timestamp': int_or_none(data.get('created')), + 'view_count': int_or_none(data.get('viewCount')), } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py deleted file mode 100644 index 785a8045e..000000000 --- a/youtube_dl/extractor/rtlnow.py +++ /dev/null @@ -1,174 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - unified_strdate, - int_or_none, -) - - -class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'''(?x) - (?:https?://)? - (?P<url> - (?P<domain> - rtl-now\.rtl\.de| - rtl2now\.rtl2\.de| - (?:www\.)?voxnow\.de| - (?:www\.)?rtlnitronow\.de| - (?:www\.)?superrtlnow\.de| - (?:www\.)?n-tvnow\.de) - /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\? - (?:container_id|film_id)=(?P<video_id>[0-9]+)& - player=1(?:&season=[0-9]+)?(?:&.*)? - )''' - - _TESTS = [ - { - 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', - 'info_dict': { - 'id': '90419', - 'ext': 'flv', - 'title': 'Ahornallee - Folge 1 - Der Einzug', - 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', - 'upload_date': '20070416', - 'duration': 1685, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', - 'info_dict': { - 'id': '69756', - 'ext': 'flv', - 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', - 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0', - 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', - 'upload_date': '20120519', - 'duration': 1245, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', - 'info_dict': { - 'id': '13883', - 'ext': 'flv', - 'title': 'Voxtours - Südafrika-Reporter II', - 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00', - 'upload_date': '20090627', - 'duration': 1800, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', - 'info_dict': { - 'id': '99205', - 'ext': 'flv', - 'title': 'Medicopter 117 - Angst!', - 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin', - 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', - 'upload_date': '20080928', - 'duration': 2691, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5', - 'info_dict': { - 'id': '188729', - 'ext': 'flv', - 'upload_date': '20150204', - 'description': 'md5:5e1ce23095e61a79c166d134b683cecc', - 'title': 'Der Bachelor - Folge 4', - } - }, { - 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_page_url = 'http://%s/' % mobj.group('domain') - video_id = mobj.group('video_id') - - webpage = self._download_webpage('http://' + mobj.group('url'), video_id) - - mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage) - if mobj: - raise ExtractorError(clean_html(mobj.group(1)), expected=True) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage, default=None) - - upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date')) - - mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage) - duration = int(mobj.group('seconds')) if mobj else None - - playerdata_url = self._html_search_regex( - r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url') - - playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML') - - videoinfo = playerdata.find('./playlist/videoinfo') - - formats = [] - for filename in videoinfo.findall('filename'): - mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text) - if mobj: - fmt = { - 'url': mobj.group('url'), - 'play_path': 'mp4:' + mobj.group('play_path'), - 'page_url': video_page_url, - 'player_url': video_page_url + 'includes/vodplayer.swf', - } - else: - mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text) - if mobj: - fmt = { - 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'), - 'play_path': 'mp4:' + mobj.group('play_path'), - 'page_url': url, - 'player_url': video_page_url + 'includes/vodplayer.swf', - } - else: - fmt = { - 'url': filename.text, - } - fmt.update({ - 'width': int_or_none(filename.get('width')), - 'height': int_or_none(filename.get('height')), - 'vbr': int_or_none(filename.get('bitrate')), - 'ext': 'flv', - }) - formats.append(fmt) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 849300140..82cd98ac7 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ from ..utils import ( def _decrypt_url(png): - encrypted_data = base64.b64decode(png) + encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = struct_unpack('!I', text_chunk[:4])[0] diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor): @classmethod def _extract_url(cls, webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 26ced716e..9f3e944e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -47,7 +47,7 @@ class SharedIE(InfoExtractor): video_url = self._html_search_regex( r'data-url="([^"]+)"', video_page, 'video URL') title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') + 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py deleted file mode 100644 index b5fa6f1da..000000000 --- a/youtube_dl/extractor/sockshare.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - determine_ext, - ExtractorError, -) - -from .common import InfoExtractor - - -class SockshareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)' - _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>' - _TEST = { - 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', - 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', - 'info_dict': { - 'id': '437BE28B89D799D7', - 'title': 'big_buck_bunny_720p_surround.avi', - 'ext': 'avi', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://sockshare.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - confirm_hash = self._html_search_regex(r'''(?x)<input\s+ - type="hidden"\s+ - value="([^"]*)"\s+ - name="hash" - ''', webpage, 'hash') - - fields = { - "hash": confirm_hash.encode('utf-8'), - "confirm": "Continue as Free User" - } - - post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) - # Apparently, this header is required for confirmation to work. - req.add_header('Host', 'www.sockshare.com') - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage( - req, video_id, 'Downloading video page') - - video_url = self._html_search_regex( - r'<a href="([^"]*)".+class="download_file_link"', - webpage, 'file url') - video_url = "http://www.sockshare.com" + video_url - title = self._html_search_regex(( - r'<h1>(.+)<strong>', - r'var name = "([^"]+)";'), - webpage, 'title', default=None) - thumbnail = self._html_search_regex( - r'<img\s+src="([^"]*)".+?name="bg"', - webpage, 'thumbnail', default=None) - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': determine_ext(title), - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index b936202f6..06d6e6640 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor): compat_urllib_parse.unquote, re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex( + password = self._search_regex( r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') video_urls = list(map( diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 8686f9d11..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,8 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - parse_duration, - parse_iso8601, + unified_strdate, ) @@ -20,11 +19,9 @@ class SportBoxIE(InfoExtractor): 'id': '80822', 'ext': 'mp4', 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', + 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1411896237, 'upload_date': '20140928', - 'duration': 4846, }, 'params': { # m3u8 download @@ -48,17 +45,13 @@ class SportBoxIE(InfoExtractor): r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( - r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div itemprop="description">(.+?)</div>', - webpage, 'description', fatal=False) + [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], + webpage, 'title') + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'<span itemprop="uploadDate">([^<]+)</span>', - webpage, 'timestamp', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'<meta itemprop="duration" content="PT([^"]+)">', - webpage, 'duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'dateCreated', webpage, 'upload date')) return { '_type': 'url_transparent', @@ -67,8 +60,7 @@ class SportBoxIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 56be52638..b2a4b1fc0 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, qualities, + determine_ext, ) from ..compat import compat_ord @@ -108,7 +109,7 @@ class TeamcocoIE(InfoExtractor): formats = [] get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) for filed in data['files']: - if filed['type'] == 'hls': + if determine_ext(filed['url']) == 'm3u8': formats.extend(self._extract_m3u8_formats( filed['url'], video_id, ext='mp4')) else: diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 251a68680..a0c744fd1 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE): 'title': 'Con Martín Berasategui, hacer un bacalao al ...', 'duration': 662, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index 466155ef8..f6694149b 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, +) class TenPlayIE(InfoExtractor): @@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor): if protocol == 'rtmp': url = url.replace('&mp4:', '') + tbr = int_or_none(rendition.get('encodingRate'), 1000) + formats.append({ - 'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]), - 'width': rendition['frameWidth'], - 'height': rendition['frameHeight'], - 'tbr': rendition['encodingRate'] / 1024, - 'filesize': rendition['size'], + 'format_id': '_'.join( + ['rtmp', rendition['videoContainer'].lower(), + rendition['videoCodec'].lower(), '%sk' % tbr]), + 'width': int_or_none(rendition['frameWidth']), + 'height': int_or_none(rendition['frameHeight']), + 'tbr': tbr, + 'filesize': int_or_none(rendition['size']), 'protocol': protocol, 'ext': ext, 'vcodec': rendition['videoCodec'].lower(), 'container': rendition['videoContainer'].lower(), 'url': url, }) + self._sort_formats(formats) return { 'id': video_id, @@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor): 'url': json['thumbnailURL'] }], 'thumbnail': json['videoStillURL'], - 'duration': json['length'] / 1000, - 'timestamp': float(json['creationDate']) / 1000, - 'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay', - 'view_count': json['playsTotal'] + 'duration': float_or_none(json.get('length'), 1000), + 'timestamp': float_or_none(json.get('creationDate'), 1000), + 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', + 'view_count': int_or_none(json.get('playsTotal')), } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 025d0877c..656410528 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,8 +6,8 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' - _TESTS = { + _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' + _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { 'id': '10635995', @@ -32,7 +32,10 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, - } + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index d48cbbf14..59af9aba0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,26 +10,32 @@ from ..utils import ( class TNAFlixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', - 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', - 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'description': '', - 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 91, - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'info_dict': { + 'id': '553878', + 'display_id': 'Carmella-Decesare-striptease', + 'ext': 'mp4', + 'title': 'Carmella Decesare - striptease', + 'description': '', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 91, + 'age_limit': 18, + } + }, + { + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'matching_only': True, } - } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4de0aac52..fad720b68 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -26,7 +26,7 @@ class TutvIE(InfoExtractor): data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') + video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') return { 'id': internal_id, diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index ececc7ee0..591024ead 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor): formats = [ { - 'url': base64.b64decode(res['u']).decode('utf-8'), + 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'), 'ext': 'flv', 'format_id': res['l'], } for res in settings['res'] if res['u'] diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cf6af1e5c..7f2fb1ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,29 +1,65 @@ from __future__ import unicode_literals -import re +import time +import hmac +import hashlib +import itertools -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) from ..utils import ( ExtractorError, - unescapeHTML, - unified_strdate, - US_RATINGS, - determine_ext, - mimetype2ext, + int_or_none, + parse_age_limit, + parse_iso8601, ) from .common import InfoExtractor -class VikiIE(InfoExtractor): - IE_NAME = 'viki' +class VikiBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' + _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' + _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + + _APP = '65535a' + _APP_VERSION = '2.2.5.1428709186' + _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + + def _prepare_call(self, path, timestamp=None): + path += '?' if '?' not in path else '&' + if not timestamp: + timestamp = int(time.time()) + query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) + sig = hmac.new( + self._APP_SECRET.encode('ascii'), + query.encode('ascii'), + hashlib.sha1 + ).hexdigest() + return self._API_URL_TEMPLATE % (query, sig) + + def _call_api(self, path, video_id, note, timestamp=None): + resp = self._download_json( + self._prepare_call(path, timestamp), video_id, note) + + error = resp.get('error') + if error: + if error == 'invalid timestamp': + resp = self._download_json( + self._prepare_call(path, int(resp['current_timestamp'])), + video_id, '%s (retry)' % note) + error = resp.get('error') + if error: + self._raise_error(resp['error']) + + return resp - # iPad2 - _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5' + def _raise_error(self, error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), + expected=True) - _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + +class VikiIE(VikiBaseIE): + IE_NAME = 'viki' + _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -37,111 +73,218 @@ class VikiIE(InfoExtractor): }, 'skip': 'Blocked in the US', }, { + # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c', + 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', 'info_dict': { 'id': '1067139v', 'ext': 'mp4', + 'title': "'The Avengers: Age of Ultron' Press Conference", 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', + 'duration': 352, + 'timestamp': 1430380829, 'upload_date': '20150430', - 'title': '\'The Avengers: Age of Ultron\' Press Conference', + 'uploader': 'Arirang TV', + 'like_count': int, + 'age_limit': 0, } }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { 'id': '1048879v', 'ext': 'mp4', - 'upload_date': '20140820', - 'description': 'md5:54ff56d51bdfc7a30441ec967394e91c', 'title': 'Ankhon Dekhi', + 'duration': 6512, + 'timestamp': 1408532356, + 'upload_date': '20140820', + 'uploader': 'Spuul', + 'like_count': int, + 'age_limit': 13, }, 'params': { - # requires ffmpeg + # m3u8 download 'skip_download': True, } + }, { + # episode + 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', + 'md5': '190f3ef426005ba3a080a63325955bc3', + 'info_dict': { + 'id': '44699v', + 'ext': 'mp4', + 'title': 'Boys Over Flowers - Episode 1', + 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2', + 'duration': 4155, + 'timestamp': 1270496524, + 'upload_date': '20100405', + 'uploader': 'group8', + 'like_count': int, + 'age_limit': 13, + } + }, { + # youtube external + 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', + 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', + 'info_dict': { + 'id': '50562v', + 'ext': 'mp4', + 'title': 'Poor Nastya [COMPLETE] - Episode 1', + 'description': '', + 'duration': 607, + 'timestamp': 1274949505, + 'upload_date': '20101213', + 'uploader': 'ad14065n', + 'uploader_id': 'ad14065n', + 'like_count': int, + 'age_limit': 13, + } + }, { + 'url': 'http://www.viki.com/player/44699v', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - uploader_m = re.search( - r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) - if uploader_m is None: - uploader = None - else: - uploader = uploader_m.group(1).strip() - - rating_str = self._html_search_regex( - r'<strong>Rating: </strong>\s*([^<]*)<', webpage, - 'rating information', default='').strip() - age_limit = US_RATINGS.get(rating_str) - - req = compat_urllib_request.Request( - 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id) - req.add_header('User-Agent', self._USER_AGENT) - info_webpage = self._download_webpage( - req, video_id, note='Downloading info page') - err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None) - if err_msg: - if 'not available in your region' in err_msg: - raise ExtractorError( - 'Video %s is blocked from your location.' % video_id, - expected=True) - else: - raise ExtractorError('Viki said: ' + err_msg) - mobj = re.search( - r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage) - if not mobj: - raise ExtractorError('Unable to find video URL') - video_url = unescapeHTML(mobj.group('url')) - video_ext = mimetype2ext(mobj.group('mime_type')) - - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats( - video_url, video_id, ext=video_ext) - else: - formats = [{ - 'url': video_url, - 'ext': video_ext, - }] - - upload_date_str = self._html_search_regex( - r'"created_at":"([^"]+)"', info_webpage, 'upload date') - upload_date = ( - unified_strdate(upload_date_str) - if upload_date_str is not None - else None - ) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, info_webpage) - - return { + video = self._call_api( + 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + + title = None + titles = video.get('titles') + if titles: + title = titles.get('en') or titles[titles.keys()[0]] + if not title: + title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = video.get('container', {}).get('titles') + if container_titles: + container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] + title = '%s - %s' % (container_title, title) + + descriptions = video.get('descriptions') + description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None + + duration = int_or_none(video.get('duration')) + timestamp = parse_iso8601(video.get('created_at')) + uploader = video.get('author') + like_count = int_or_none(video.get('likes', {}).get('count')) + age_limit = parse_age_limit(video.get('rating')) + + thumbnails = [] + for thumbnail_id, thumbnail in video.get('images', {}).items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail.get('url'), + }) + + subtitles = {} + for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': subtitles_format, + 'url': self._prepare_call( + 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), + } for subtitles_format in ('srt', 'vtt')] + + result = { 'id': video_id, 'title': title, - 'formats': formats, 'description': description, - 'thumbnail': thumbnail, - 'age_limit': age_limit, + 'duration': duration, + 'timestamp': timestamp, 'uploader': uploader, - 'subtitles': video_subtitles, - 'upload_date': upload_date, + 'like_count': like_count, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'subtitles': subtitles, } - def _get_subtitles(self, video_id, info_webpage): - res = {} - for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage): - sturl = unescapeHTML(sturl_html) - m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) - if not m: - continue - res[m.group('lang')] = [{ - 'url': compat_urlparse.urljoin('http://www.viki.com', sturl), - 'ext': 'vtt', - }] - return res + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + formats = [] + for format_id, stream_dict in streams.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + for protocol, format_dict in stream_dict.items(): + if format_id == 'm3u8': + formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + else: + formats.append({ + 'url': format_dict['url'], + 'format_id': '%s-%s' % (format_id, protocol), + 'height': height, + }) + self._sort_formats(formats) + + result['formats'] = formats + return result + + +class VikiChannelIE(VikiBaseIE): + IE_NAME = 'viki:channel' + _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', + 'info_dict': { + 'id': '50c', + 'title': 'Boys Over Flowers', + 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + }, + 'playlist_count': 70, + }, { + 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', + 'info_dict': { + 'id': '1354c', + 'title': 'Poor Nastya [COMPLETE]', + 'description': 'md5:05bf5471385aa8b21c18ad450e350525', + }, + 'playlist_count': 127, + }, { + 'url': 'http://www.viki.com/news/24569c-showbiz-korea', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/artists/2141c-shinee', + 'only_matching': True, + }] + + _PER_PAGE = 25 + + def _real_extract(self, url): + channel_id = self._match_id(url) + + channel = self._call_api( + 'containers/%s.json' % channel_id, channel_id, + 'Downloading channel JSON') + + titles = channel['titles'] + title = titles.get('en') or titles[titles.keys()[0]] + + descriptions = channel['descriptions'] + description = descriptions.get('en') or descriptions[descriptions.keys()[0]] + + entries = [] + for video_type in ('episodes', 'clips', 'movies'): + for page_num in itertools.count(1): + page = self._call_api( + 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' + % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, + 'Downloading %s JSON page #%d' % (video_type, page_num)) + for video in page['response']: + video_id = video['id'] + entries.append(self.url_result( + 'http://www.viki.com/videos/%s' % video_id, 'Viki')) + if not page['pagination']['next']: + break + + return self.playlist_result(entries, channel_id, title, description) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f9940cf5..0301682b8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1126,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: - # Hide the formats we found through non-DASH + # Remove the formats we found through non-DASH, they + # contain less info and it can be wrong, because we use + # fixed values (for example the resolution). See + # https://github.com/rg3/youtube-dl/issues/5774 for an + # example. dash_keys = set(df['format_id'] for df in dash_formats) - for f in formats: - if f['format_id'] in dash_keys: - f['format_id'] = 'nondash-%s' % f['format_id'] - f['preference'] = f.get('preference', 0) - 10000 + formats = [f for f in formats if f['format_id'] not in dash_keys] formats.extend(dash_formats) # Check for malformed aspect ratio diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 22dbc3aec..5a2315bd9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None): verbosity.add_option( '--dump-pages', '--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, - help='Print downloaded pages to debug problems (very verbose)') + help='Print downloaded pages encoded using base64 to debug problems (very verbose)') verbosity.add_option( '--write-pages', action='store_true', dest='write_pages', default=False, @@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None): help='Parse additional metadata like song title / artist from the video title. ' 'The format syntax is the same as --output, ' 'the parsed parameters replace existing values. ' - 'Additional templates: %(album), %(artist). ' + 'Additional templates: %(album)s, %(artist)s. ' 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' '"Coldplay - Paradise"') postproc.add_option( diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 38f00bc9b..b33385153 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.15' +__version__ = '2015.05.20' |