diff options
31 files changed, 845 insertions, 310 deletions
@@ -400,7 +400,7 @@ which means you can modify it, redistribute it or use it however you like. downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm {}' - --convert-subtitles FORMAT Convert the subtitles to other format + --convert-subs FORMAT Convert the subtitles to other format (currently supported: srt|ass|vtt) # CONFIGURATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1646277ec..84c166805 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -125,7 +125,7 @@ - **dailymotion:user** - **DailymotionCloud** - **daum.net** - - **daum.net** + - **daum.net:clip** - **DBTV** - **DCN** - **dcn:live** @@ -164,7 +164,7 @@ - **Eporner** - **EroProfile** - **Escapist** - - **ESPN** (Currently broken) + - **ESPN** - **EsriVideo** - **Europa** - **EveryonesMixtape** @@ -181,6 +181,7 @@ - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** + - **FOX** - **Foxgay** - **FoxNews**: Fox News and Fox Business Video - **FoxSports** @@ -259,7 +260,6 @@ - **JeuxVideo** - **Jove** - **jpopsuki.tv** - - **Jukebox** - **JWPlatform** - **Kaltura** - **KanalPlay**: Kanal 5/9/11 Play @@ -367,11 +367,13 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 + - **nextmovie.com** - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** - **nhl.com:news**: NHL news - **nhl.com:videocenter**: NHL videocenter category + - **nick.com** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **njoy**: N-JOY @@ -405,17 +407,19 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** + - **OraTV** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **pandora.tv**: 판도라TV - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **Periscope**: Periscope - **PhilharmonieDeParis**: Philharmonie de Paris - - **Phoenix** + - **phoenix.de** - **Photobucket** - **Pinkbike** - **Pladform** @@ -457,6 +461,7 @@ - **RBMARadio** - **RDS**: RDS.ca - **RedTube** + - **RegioTV** - **Restudy** - **ReverbNation** - **RingTV** @@ -535,7 +540,8 @@ - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** - - **Srf** + - **SRGSSR** + - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** @@ -579,7 +585,6 @@ - **THVideo** - **THVideoPlaylist** - **tinypic**: tinypic.com videos - - **tlc.com** - **tlc.de** - **TMZ** - **TMZArticle** @@ -608,6 +613,7 @@ - **TVC** - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru + - **tvland.com** - **tvp.pl** - **tvp.pl:Series** - **TVPlay**: TV3Play and related services @@ -646,6 +652,9 @@ - **VideoDetective** - **videofy.me** - **VideoMega** + - **videomore** + - **videomore:season** + - **videomore:video** - **VideoPremium** - **VideoTt**: video.tt - Your True Tube - **videoweed**: VideoWeed diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 50425b8d7..3b2be3159 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1791,6 +1791,10 @@ class YoutubeDL(object): res = '' if fdict.get('ext') in ['f4f', 'f4m']: res += '(unsupported) ' + if fdict.get('language'): + if res: + res += ' ' + res += '[%s]' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb7151443..4c7e5223d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -203,6 +203,7 @@ from .flickr import FlickrIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .fourtube import FourTubeIE +from .fox import FOXIE from .foxgay import FoxgayIE from .foxnews import FoxNewsIE from .foxsports import FoxSportsIE @@ -433,6 +434,7 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) +from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( @@ -440,6 +442,7 @@ from .nhl import ( NHLNewsIE, NHLVideocenterIE, ) +from .nick import NickIE from .niconico import NiconicoIE, NiconicoPlaylistIE from .ninegag import NineGagIE from .noco import NocoIE @@ -489,12 +492,14 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .ora import OraTVIE from .orf import ( ORFTVthekIE, ORFOE1IE, ORFFM4IE, ORFIPTVIE, ) +from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE @@ -549,8 +554,10 @@ from .rai import ( from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redtube import RedTubeIE +from .regiotv import RegioTVIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE +from .revision3 import Revision3IE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE @@ -680,7 +687,6 @@ from .telemb import TeleMBIE from .teletask import TeleTaskIE from .tenplay import TenPlayIE from .testurl import TestURLIE -from .testtube import TestTubeIE from .tf1 import TF1IE from .theintercept import TheInterceptIE from .theonion import TheOnionIE @@ -692,7 +698,7 @@ from .thesixtyone import TheSixtyOneIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE -from .tlc import TlcIE, TlcDeIE +from .tlc import TlcDeIE from .tmz import ( TMZIE, TMZArticleIE, @@ -735,6 +741,7 @@ from .tvc import ( TVCArticleIE, ) from .tvigle import TvigleIE +from .tvland import TVLandIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .tweakers import TweakersIE diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index e37ee4440..76b21e596 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): @@ -14,8 +14,8 @@ class BaiduVideoIE(InfoExtractor): 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { 'id': '1069', - 'title': '中华小当家 TV版 (全52集)', - 'description': 'md5:395a419e41215e531c857bb037bbaf80', + 'title': '中华小当家 TV版国语', + 'description': 'md5:51be07afe461cf99fa61231421b5397c', }, 'playlist_count': 52, }, { @@ -25,45 +25,32 @@ class BaiduVideoIE(InfoExtractor): 'title': 're:^奔跑吧兄弟', 'description': 'md5:1bf88bad6d850930f542d51547c089b8', }, - 'playlist_mincount': 3, + 'playlist_mincount': 12, }] + def _call_api(self, path, category, playlist_id, note): + return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( + path, category, playlist_id), playlist_id, note) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - category = category2 = mobj.group('type') + category, playlist_id = re.match(self._VALID_URL, url).groups() if category == 'show': - category2 = 'tvshow' - - webpage = self._download_webpage(url, playlist_id) - - playlist_title = self._html_search_regex( - r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage, - 'playlist title', group='title') - playlist_description = self._html_search_regex( - r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage, - playlist_id, 'playlist description') + category = 'tvshow' + if category == 'tv': + category = 'tvplay' - site = self._html_search_regex( - r'filterSite\s*:\s*["\']([^"]*)["\']', webpage, - 'primary provider site') - api_result = self._download_json( - 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % ( - category, category2, playlist_id, site), - playlist_id, 'Get playlist links') + playlist_detail = self._call_api( + 'xqinfo', category, playlist_id, 'Download playlist JSON metadata') - entries = [] - for episode in api_result[0]['episodes']: - episode_id = '%s_%s' % (playlist_id, episode['episode']) + playlist_title = playlist_detail['title'] + playlist_description = unescapeHTML(playlist_detail.get('intro')) - redirect_page = self._download_webpage( - compat_urlparse.urljoin(url, episode['url']), episode_id, - note='Download Baidu redirect page') - real_url = self._html_search_regex( - r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL') + episodes_detail = self._call_api( + 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') - entries.append(self.url_result( - real_url, video_title=episode['single_title'])) + entries = [self.url_result( + episode['url'], video_title=episode['title'] + ) for episode in episodes_detail['videos']] return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 923273fb2..7b169881a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -23,7 +23,17 @@ class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _ID_REGEX = r'[pb][\da-z]{7}' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>%s)' % _ID_REGEX + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/clips[/#]| + radio/player/ + ) + (?P<id>%s) + ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -193,6 +203,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, } ] @@ -469,7 +482,8 @@ class BBCCoUkIE(InfoExtractor): if programme_id: formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', webpage, 'title') description = self._search_regex( r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', webpage, 'description', default=None) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 004372f8d..25b2d4efe 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -10,13 +10,14 @@ from ..utils import ( unified_strdate, url_basename, qualities, + int_or_none, ) class CanalplusIE(InfoExtractor): IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' - _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { 'canalplus.fr': 'cplus', 'piwiplus.fr': 'teletoon', @@ -26,10 +27,10 @@ class CanalplusIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', - 'md5': 'b3481d7ca972f61e37420798d0a9d934', + 'md5': '12164a6f14ff6df8bd628e8ba9b10b78', 'info_dict': { 'id': '1263092', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Le Zapping - 13/05/15', 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', 'upload_date': '20150513', @@ -56,10 +57,10 @@ class CanalplusIE(InfoExtractor): 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4', + 'md5': '38b8f7934def74f0d6f3ba6c036a5f82', 'info_dict': { 'id': '1213714', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45', 'description': 'md5:8216206ec53426ea6321321f3b3c16db', 'upload_date': '20150211', @@ -82,15 +83,16 @@ class CanalplusIE(InfoExtractor): webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) - doc = self._download_xml(info_url, video_id, 'Downloading video XML') + video_data = self._download_json(info_url, video_id, 'Downloading video JSON') - video_info = [video for video in doc if video.find('ID').text == video_id][0] - media = video_info.find('MEDIA') - infos = video_info.find('INFOS') + if isinstance(video_data, list): + video_data = [video for video in video_data if video.get('ID') == video_id][0] + media = video_data['MEDIA'] + infos = video_data['INFOS'] - preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']) + preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) - fmt_url = next(iter(media.find('VIDEOS'))).text + fmt_url = next(iter(media.get('VIDEOS'))) if '/geo' in fmt_url.lower(): response = self._request_webpage( HEADRequest(fmt_url), video_id, @@ -101,35 +103,42 @@ class CanalplusIE(InfoExtractor): expected=True) formats = [] - for fmt in media.find('VIDEOS'): - format_url = fmt.text + for format_id, format_url in media['VIDEOS'].items(): if not format_url: continue - format_id = fmt.tag if format_id == 'HLS': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', preference=preference(format_id))) + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) elif format_id == 'HDS': formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id))) + format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - 'url': format_url, + # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), }) self._sort_formats(formats) + thumbnails = [{ + 'id': image_id, + 'url': image_url, + } for image_id, image_url in media.get('images', {}).items()] + + titrage = infos['TITRAGE'] + return { 'id': video_id, 'display_id': display_id, - 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text, - infos.find('TITRAGE/SOUS_TITRE').text), - 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), - 'thumbnail': media.find('IMAGES/GRAND').text, - 'description': infos.find('DESCRIPTION').text, - 'view_count': int(infos.find('NB_VUES').text), - 'like_count': int(infos.find('NB_LIKES').text), - 'comment_count': int(infos.find('NB_COMMENTS').text), + 'title': '%s - %s' % (titrage['TITRE'], + titrage['SOUS_TITRE']), + 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), + 'thumbnails': thumbnails, + 'description': infos.get('DESCRIPTION'), + 'duration': int_or_none(infos.get('DURATION')), + 'view_count': int_or_none(infos.get('NB_VUES')), + 'like_count': int_or_none(infos.get('NB_LIKES')), + 'comment_count': int_or_none(infos.get('NB_COMMENTS')), 'formats': formats, } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index f9a64a0a2..d211ec23b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -5,6 +5,7 @@ import re import json from .common import InfoExtractor +from ..utils import remove_start class CBSNewsIE(InfoExtractor): @@ -62,6 +63,7 @@ class CBSNewsIE(InfoExtractor): uri = item.get('media' + format_id + 'URI') if not uri: continue + uri = remove_start(uri, '{manifest:none}') fmt = { 'url': uri, 'format_id': format_id, @@ -70,6 +72,8 @@ class CBSNewsIE(InfoExtractor): play_path = re.sub( r'{slistFilePath}', '', uri.split('<break>')[-1].split('{break}')[-1]) + play_path = re.sub( + r'{manifest:.+}.*$', '', play_path) fmt.update({ 'app': 'ondemand?auth=cbs', 'play_path': 'mp4:' + play_path, diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 6924eac70..e94b1e35b 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + parse_duration, qualities, unified_strdate, ) @@ -12,21 +13,25 @@ from ..utils import ( class CCCIE(InfoExtractor): IE_NAME = 'media.ccc.de' - _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/[^?#]+/[^?#/]*?_(?P<id>[0-9]{8,})._[^?#/]*\.html' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video', + _TESTS = [{ + 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { - 'id': '20131228183', + 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', 'ext': 'mp4', 'title': 'Introduction to Processor Design', - 'description': 'md5:5ddbf8c734800267f2cee4eab187bc1b', + 'description': 'md5:80be298773966f66d56cb11260b879af', 'thumbnail': 're:^https?://.*\.jpg$', 'view_count': int, - 'upload_date': '20131229', + 'upload_date': '20131228', + 'duration': 3660, } - } + }, { + 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -40,21 +45,25 @@ class CCCIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h1>(.*?)</h1>', webpage, 'title') description = self._html_search_regex( - r"(?s)<p class='description'>(.*?)</p>", + r"(?s)<h3>About</h3>(.+?)<h3>", webpage, 'description', fatal=False) upload_date = unified_strdate(self._html_search_regex( - r"(?s)<span class='[^']*fa-calendar-o'></span>(.*?)</li>", + r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>", webpage, 'upload date', fatal=False)) view_count = int_or_none(self._html_search_regex( r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>", webpage, 'view count', fatal=False)) + duration = parse_duration(self._html_search_regex( + r'(?s)<span[^>]+class=(["\']).*?fa-clock-o.*?\1[^>]*></span>(?P<duration>.+?)</li', + webpage, 'duration', fatal=False, group='duration')) matches = re.finditer(r'''(?xs) - <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s* + <(?:span|div)\s+class='label\s+filetype'>(?P<format>[^<]*)</(?:span|div)>\s* + <(?:span|div)\s+class='label\s+filetype'>(?P<lang>[^<]*)</(?:span|div)>\s* <a\s+download\s+href='(?P<http_url>[^']+)'>\s* (?: .*? - <a\s+href='(?P<torrent_url>[^']+\.torrent)' + <a\s+(?:download\s+)?href='(?P<torrent_url>[^']+\.torrent)' )?''', webpage) formats = [] for m in matches: @@ -62,12 +71,15 @@ class CCCIE(InfoExtractor): format_id = self._search_regex( r'.*/([a-z0-9_-]+)/[^/]*$', m.group('http_url'), 'format id', default=None) + if format_id: + format_id = m.group('lang') + '-' + format_id vcodec = 'h264' if 'h264' in format_id else ( 'none' if format_id in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, 'format': format, + 'language': m.group('lang'), 'url': m.group('http_url'), 'vcodec': vcodec, 'preference': preference(format_id), @@ -95,5 +107,6 @@ class CCCIE(InfoExtractor): 'thumbnail': thumbnail, 'view_count': view_count, 'upload_date': upload_date, + 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c63157619..0719c7bcd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -108,8 +108,9 @@ class InfoExtractor(object): -2 or smaller for less than default. < -1000 to hide the format (if there is another one which is strictly better) - * language_preference Is this in the correct requested - language? + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? 10 if it's what the URL is about, -1 for default (don't know), -10 otherwise, other values reserved for now. diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index d6723ecf2..ce680a9f3 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,17 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?' + _VALID_URL = r'''(?x)http://(?:www\.)?(?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocity + )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)''' _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', 'info_dict': { @@ -21,8 +31,8 @@ class DiscoveryIE(InfoExtractor): 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s' ' back.'), 'duration': 156, - 'timestamp': 1303099200, - 'upload_date': '20110418', + 'timestamp': 1302032462, + 'upload_date': '20110405', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -33,27 +43,38 @@ class DiscoveryIE(InfoExtractor): 'id': 'mythbusters-the-simpsons', 'title': 'MythBusters: The Simpsons', }, - 'playlist_count': 9, + 'playlist_mincount': 10, + }, { + 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/', + 'info_dict': { + 'id': '78326', + 'ext': 'mp4', + 'title': 'Longfin Eels: Maneaters?', + 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.', + 'upload_date': '20140725', + 'timestamp': 1406246400, + 'duration': 116, + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json(url + '?flat=1', video_id) + display_id = self._match_id(url) + info = self._download_json(url + '?flat=1', display_id) video_title = info.get('playlist_title') or info.get('video_title') entries = [{ 'id': compat_str(video_info['id']), 'formats': self._extract_m3u8_formats( - video_info['src'], video_id, ext='mp4', + video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', note='Download m3u8 information for video %d' % (idx + 1)), 'title': video_info['title'], 'description': video_info.get('description'), 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href'), + 'webpage_url': video_info.get('href') or video_info.get('url'), 'thumbnail': video_info.get('thumbnailURL'), 'alt_title': video_info.get('secondary_title'), 'timestamp': parse_iso8601(video_info.get('publishedDate')), } for idx, video_info in enumerate(info['playlist'])] - return self.playlist_result(entries, video_id, video_title) + return self.playlist_result(entries, display_id, video_title) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 8ac8587be..028144f20 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -2,14 +2,10 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, -) +from .zdf import ZDFIE -class DreiSatIE(InfoExtractor): +class DreiSatIE(ZDFIE): IE_NAME = '3sat' _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TESTS = [ @@ -35,53 +31,4 @@ class DreiSatIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - details_doc = self._download_xml(details_url, video_id, 'Downloading video details') - - status_code = details_doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, code) - raise ExtractorError(message, expected=True) - - thumbnail_els = details_doc.findall('.//teaserimage') - thumbnails = [{ - 'width': int(te.attrib['key'].partition('x')[0]), - 'height': int(te.attrib['key'].partition('x')[2]), - 'url': te.text, - } for te in thumbnail_els] - - information_el = details_doc.find('.//information') - video_title = information_el.find('./title').text - video_description = information_el.find('./detail').text - - details_el = details_doc.find('.//details') - video_uploader = details_el.find('./channel').text - upload_date = unified_strdate(details_el.find('./airtime').text) - - format_els = details_doc.findall('.//formitaet') - formats = [{ - 'format_id': fe.attrib['basetype'], - 'width': int(fe.find('./width').text), - 'height': int(fe.find('./height').text), - 'url': fe.find('./url').text, - 'filesize': int(fe.find('./filesize').text), - 'video_bitrate': int(fe.find('./videoBitrate').text), - } for fe in format_els - if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')] - - self._sort_formats(formats) - - return { - '_type': 'video', - 'id': video_id, - 'title': video_title, - 'formats': formats, - 'description': video_description, - 'thumbnails': thumbnails, - 'thumbnail': thumbnails[-1]['url'], - 'uploader': video_uploader, - 'upload_date': upload_date, - } + return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index 5dfea0d39..f7339702c 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -1,9 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + remove_start, + sanitized_Request, +) class EinthusanIE(InfoExtractor): @@ -34,27 +37,33 @@ class EinthusanIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + video_id = self._match_id(url) + + request = sanitized_Request(url) + request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0') + webpage = self._download_webpage(request, video_id) + + title = self._html_search_regex( + r'<h1><a[^>]+class=["\']movie-title["\'][^>]*>(.+?)</a></h1>', + webpage, 'title') - video_title = self._html_search_regex( - r'<h1><a class="movie-title".*?>(.*?)</a></h1>', webpage, 'title') + video_id = self._search_regex( + r'data-movieid=["\'](\d+)', webpage, 'video id', default=video_id) - video_url = self._html_search_regex( - r'''(?s)jwplayer\("mediaplayer"\)\.setup\({.*?'file': '([^']+)'.*?}\);''', - webpage, 'video url') + video_url = self._download_webpage( + 'http://cdn.einthusan.com/geturl/%s/hd/London,Washington,Toronto,Dallas,San,Sydney/' + % video_id, video_id) description = self._html_search_meta('description', webpage) thumbnail = self._html_search_regex( r'''<a class="movie-cover-wrapper".*?><img src=["'](.*?)["'].*?/></a>''', webpage, "thumbnail url", fatal=False) if thumbnail is not None: - thumbnail = thumbnail.replace('..', 'http://www.einthusan.com') + thumbnail = compat_urlparse.urljoin(url, remove_start(thumbnail, '..')) return { 'id': video_id, - 'title': video_title, + 'title': title, 'url': video_url, 'thumbnail': thumbnail, 'description': description, diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index e6f8f0337..3762d8748 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,18 +1,30 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import remove_end class ESPNIE(InfoExtractor): _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' - _WORKING = False _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', 'info_dict': { 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', - 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'description': '', + 'title': '30 for 30 Shorts: Judging Jewell', + 'description': None, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season + 'url': 'http://espn.go.com/video/clip?id=2743663', + 'info_dict': { + 'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg', + 'ext': 'mp4', + 'title': 'Must-See Moments: Best of the MLS season', }, 'params': { # m3u8 download @@ -44,12 +56,23 @@ class ESPNIE(InfoExtractor): r'class="video-play-button"[^>]+data-id="(\d+)', webpage, 'video id') + cms = 'espn' + if 'data-source="intl"' in webpage: + cms = 'intl' + player_url = 'https://espn.go.com/video/iframe/twitter/?id=%s&cms=%s' % (video_id, cms) player = self._download_webpage( - 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id) + player_url, video_id) pcode = self._search_regex( r'["\']pcode=([^"\']+)["\']', player, 'pcode') - return self.url_result( - 'ooyalaexternal:espn:%s:%s' % (video_id, pcode), - 'OoyalaExternal') + title = remove_end( + self._og_search_title(webpage), + '- ESPN Video').strip() + + return { + '_type': 'url_transparent', + 'url': 'ooyalaexternal:%s:%s:%s' % (cms, video_id, pcode), + 'ie_key': 'OoyalaExternal', + 'title': title, + } diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py new file mode 100644 index 000000000..ab97b3196 --- /dev/null +++ b/youtube_dl/extractor/fox.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class FOXIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.fox.com/watch/255180355939/7684182528', + 'info_dict': { + 'id': '255180355939', + 'ext': 'mp4', + 'title': 'Official Trailer: Gotham', + 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', + 'duration': 129, + }, + 'add_ie': ['ThePlatform'], + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + release_url = self._parse_json(self._search_regex( + r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'), + video_id)['release_url'] + '&manifest=m3u' + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url(release_url, {'force_smil_url': True}), + 'id': video_id, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index d887583e6..e8bb527b8 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -167,14 +167,16 @@ class MTVServicesInfoExtractor(InfoExtractor): 'description': description, } + def _get_feed_query(self, uri): + data = {'uri': uri} + if self._LANG: + data['lang'] = self._LANG + return compat_urllib_parse.urlencode(data) + def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - data = compat_urllib_parse.urlencode({'uri': uri}) - info_url = feed_url + '?' - if self._LANG: - info_url += 'lang=%s&' % self._LANG - info_url += data + info_url = feed_url + '?' + self._get_feed_query(uri) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): @@ -184,9 +186,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return self.playlist_result( [self._get_video_info(item) for item in idoc.findall('.//item')]) - def _real_extract(self, url): - title = url_basename(url) - webpage = self._download_webpage(url, title) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -207,7 +207,12 @@ class MTVServicesInfoExtractor(InfoExtractor): 'sm4:video:embed', webpage, 'sm4 embed', default='') mgid = self._search_regex( r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid') + return mgid + def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + mgid = self._extract_mgid(webpage) videos_info = self._get_videos_info(mgid) return videos_info diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py new file mode 100644 index 000000000..657ae77a0 --- /dev/null +++ b/youtube_dl/extractor/nextmovie.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from ..compat import compat_urllib_parse + + +class NextMovieIE(MTVServicesInfoExtractor): + IE_NAME = 'nextmovie.com' + _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' + _TESTS = [{ + 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', + 'md5': '09a9199f2f11f10107d04fcb153218aa', + 'info_dict': { + 'id': '961726', + 'ext': 'mp4', + 'title': 'The Muppets\' Gravity', + }, + }] + + def _get_feed_query(self, uri): + return compat_urllib_parse.urlencode({ + 'feed': '1505', + 'mgid': uri, + }) + + def _real_extract(self, url): + mgid = self._match_id(url) + return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py new file mode 100644 index 000000000..b62819ae5 --- /dev/null +++ b/youtube_dl/extractor/nick.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from ..compat import compat_urllib_parse + + +class NickIE(MTVServicesInfoExtractor): + IE_NAME = 'nick.com' + _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)' + _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' + _TESTS = [{ + 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', + 'playlist': [ + { + 'md5': '6e5adc1e28253bbb1b28ab05403dd4d4', + 'info_dict': { + 'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30', + 'ext': 'mp4', + 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1', + 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', + + } + }, + { + 'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce', + 'info_dict': { + 'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30', + 'ext': 'mp4', + 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2', + 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', + + } + }, + { + 'md5': 'efffe1728a234b2b0d2f2b343dd1946f', + 'info_dict': { + 'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30', + 'ext': 'mp4', + 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3', + 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', + } + }, + { + 'md5': '1ec6690733ab9f41709e274a1d5c7556', + 'info_dict': { + 'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30', + 'ext': 'mp4', + 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4', + 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', + } + }, + ], + }] + + def _get_feed_query(self, uri): + return compat_urllib_parse.urlencode({ + 'feed': 'nick_arc_player_prime', + 'mgid': uri, + }) + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py new file mode 100644 index 000000000..9c4255a2d --- /dev/null +++ b/youtube_dl/extractor/ora.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + get_element_by_attribute, + qualities, + unescapeHTML, +) + + +class OraTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)' + _TEST = { + 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq', + 'md5': 'fa33717591c631ec93b04b0e330df786', + 'info_dict': { + 'id': '50178', + 'ext': 'mp4', + 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!', + 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1', + 'duration': 1477, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_data = self._search_regex( + r'"current"\s*:\s*({[^}]+?})', webpage, 'current video') + m3u8_url = self._search_regex( + r'"hls_stream"\s*:\s*"([^"]+)', video_data, 'm3u8 url', None) + if m3u8_url: + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + # simular to GameSpotIE + m3u8_path = compat_urlparse.urlparse(m3u8_url).path + QUALITIES_RE = r'((,[a-z]+\d+)+,?)' + available_qualities = self._search_regex( + QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',') + http_path = m3u8_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/master.m3u8', '') + http_template = compat_urlparse.urljoin( + 'http://videocdn-pmd.ora.tv/', http_template) + preference = qualities( + ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080']) + for q in available_qualities: + formats.append({ + 'url': http_template % q, + 'format_id': q, + 'preference': preference(q), + }) + self._sort_formats(formats) + else: + return self.url_result(self._search_regex( + r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube') + + return { + 'id': self._search_regex( + r'"video_id"\s*:\s*(\d+)', video_data, 'video id'), + 'display_id': display_id, + 'title': unescapeHTML(self._og_search_title(webpage)), + 'description': get_element_by_attribute( + 'class', 'video_txt_decription', webpage), + 'thumbnail': self._proto_relative_url(self._search_regex( + r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)), + 'duration': int(self._search_regex( + r'"duration"\s*:\s*(\d+)', video_data, 'duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py new file mode 100644 index 000000000..8d49f5c4a --- /dev/null +++ b/youtube_dl/extractor/pandoratv.py @@ -0,0 +1,78 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + parse_duration, + str_to_int, +) + + +class PandoraTVIE(InfoExtractor): + IE_NAME = 'pandora.tv' + IE_DESC = '판도라TV' + _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' + _TEST = { + 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', + 'info_dict': { + 'id': '53294230', + 'ext': 'flv', + 'title': '頭を撫でてくれる?', + 'description': '頭を撫でてくれる?', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 39, + 'upload_date': '20151218', + 'uploader': 'カワイイ動物まとめ', + 'uploader_id': 'mikakim', + 'view_count': int, + 'like_count': int, + } + } + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('prgid', [None])[0] + user_id = qs.get('ch_userid', [None])[0] + if any(not f for f in (video_id, user_id,)): + raise ExtractorError('Invalid URL', expected=True) + + data = self._download_json( + 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' + % (user_id, video_id), video_id) + + info = data['data']['rows']['vod_play_info']['result'] + + formats = [] + for format_id, format_url in info.items(): + if not format_url: + continue + height = self._search_regex( + r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'format_id': '%sp' % height, + 'url': format_url, + 'height': int(height), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info['subject'], + 'description': info.get('body'), + 'thumbnail': info.get('thumbnail') or info.get('poster'), + 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), + 'upload_date': info['fid'][:8] if isinstance(info.get('fid'), compat_str) else None, + 'uploader': info.get('nickname'), + 'uploader_id': info.get('upload_userid'), + 'view_count': str_to_int(info.get('hit')), + 'like_count': str_to_int(info.get('likecnt')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 1ba3bbddf..45a3c41c5 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -11,6 +11,7 @@ from ..utils import ( strip_jsonp, unescapeHTML, clean_html, + ExtractorError, ) @@ -177,7 +178,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', - 'description': 'md5:2a222d89ba4455a3af19940c0481bb78', + 'description': 'md5:870ec08f7d8547c29c93010899103751', }, 'playlist_count': 12, } @@ -272,7 +273,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): 'url': 'http://y.qq.com/#type=toplist&p=top_3', 'info_dict': { 'id': 'top_3', - 'title': 'QQ音乐巅峰榜·欧美', + 'title': '巅峰榜·欧美', 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' @@ -315,7 +316,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_DESC = 'QQ音乐 - 歌单' _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://y.qq.com/#type=taoge&id=3462654915', 'info_dict': { 'id': '3462654915', @@ -323,7 +324,16 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', }, 'playlist_count': 40, - } + 'skip': 'playlist gone', + }, { + 'url': 'http://y.qq.com/#type=taoge&id=1374105607', + 'info_dict': { + 'id': '1374105607', + 'title': '易入人心的华语民谣', + 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', + }, + 'playlist_count': 20, + }] def _real_extract(self, url): list_id = self._match_id(url) @@ -331,14 +341,21 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): list_json = self._download_json( 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' % list_id, list_id, 'Download list page', - transform_source=strip_jsonp)['cdlist'][0] - + transform_source=strip_jsonp) + if not len(list_json.get('cdlist', [])): + if list_json.get('code'): + raise ExtractorError( + 'QQ Music said: error %d in fetching playlist info' % list_json['code'], + expected=True) + raise ExtractorError('Unable to get playlist info') + + cdlist = list_json['cdlist'][0] entries = [ self.url_result( 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] - ) for song in list_json['songlist'] + ) for song in cdlist['songlist'] ] - list_name = list_json.get('dissname') - list_description = clean_html(unescapeHTML(list_json.get('desc'))) + list_name = cdlist.get('dissname') + list_description = clean_html(unescapeHTML(cdlist.get('desc'))) return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/regiotv.py b/youtube_dl/extractor/regiotv.py new file mode 100644 index 000000000..e250a52f0 --- /dev/null +++ b/youtube_dl/extractor/regiotv.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + sanitized_Request, + xpath_text, + xpath_with_ns, +) + + +class RegioTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.regio-tv.de/video/395808.html', + 'info_dict': { + 'id': '395808', + 'ext': 'mp4', + 'title': 'Wir in Ludwigsburg', + 'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!', + } + }, { + 'url': 'http://www.regio-tv.de/video/395808', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + key = self._search_regex( + r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key') + title = self._og_search_title(webpage) + + SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>' + + request = sanitized_Request( + 'http://v.telvi.de/', + SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8')) + video_data = self._download_xml(request, video_id, 'Downloading video XML') + + NS_MAP = { + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', + 'soap': 'http://schemas.xmlsoap.org/soap/envelope/', + } + + video_url = xpath_text( + video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True) + thumbnail = xpath_text( + video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail') + description = self._og_search_description( + webpage) or self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py new file mode 100644 index 000000000..b1b8800b9 --- /dev/null +++ b/youtube_dl/extractor/revision3.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + unescapeHTML, + qualities, +) + + +class Revision3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' + _TESTS = [{ + 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', + 'md5': 'd94a72d85d0a829766de4deb8daaf7df', + 'info_dict': { + 'id': '73034', + 'display_id': 'technobuffalo/5-google-predictions-for-2016', + 'ext': 'webm', + 'title': '5 Google Predictions for 2016', + 'description': 'Google had a great 2015, but it\'s already time to look ahead. Here are our five predictions for 2016.', + 'upload_date': '20151228', + 'timestamp': 1451325600, + 'duration': 187, + 'uploader': 'TechnoBuffalo', + 'uploader_id': 'technobuffalo', + } + }, { + 'url': 'http://testtube.com/brainstuff', + 'info_dict': { + 'id': '251', + 'title': 'BrainStuff', + 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', + }, + 'playlist_mincount': 93, + }, { + 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', + 'info_dict': { + 'id': '60163', + 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', + 'duration': 275, + 'ext': 'webm', + 'title': '5 Weird Ways Plants Can Eat Animals', + 'description': 'Why have some plants evolved to eat meat?', + 'upload_date': '20150120', + 'timestamp': 1421763300, + 'uploader': 'DNews', + 'uploader_id': 'dnews', + }, + }] + _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' + _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + page_info = self._download_json( + self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) + + if page_info['data']['type'] == 'episode': + episode_data = page_info['data'] + video_id = compat_str(episode_data['video']['data']['id']) + video_data = self._download_json( + 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), + video_id)['items'][0] + + formats = [] + for vcodec, media in video_data['media'].items(): + for quality_id, quality in media.items(): + if quality_id == 'hls': + formats.extend(self._extract_m3u8_formats( + quality['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': quality['url'], + 'format_id': '%s-%s' % (vcodec, quality_id), + 'tbr': int_or_none(quality.get('bitrate')), + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + preference = qualities(['mini', 'small', 'medium', 'large']) + thumbnails = [{ + 'url': image_url, + 'id': image_id, + 'preference': preference(image_id) + } for image_id, image_url in video_data.get('images', {}).items()] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': unescapeHTML(video_data['title']), + 'description': unescapeHTML(video_data.get('summary')), + 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), + 'author': episode_data.get('author'), + 'uploader': video_data.get('show', {}).get('name'), + 'uploader_id': video_data.get('show', {}).get('slug'), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + } + else: + show_data = page_info['show']['data'] + episodes_data = page_info['episodes']['data'] + num_episodes = page_info['meta']['totalEpisodes'] + processed_episodes = 0 + entries = [] + page_num = 1 + while True: + entries.extend([self.url_result( + 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) + processed_episodes += len(episodes_data) + if processed_episodes == num_episodes: + break + page_num += 1 + episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % ( + domain, display_id + '/' + compat_str(page_num), domain), + display_id)['episodes']['data'] + + return self.playlist_result( + entries, compat_str(show_data['id']), + show_data.get('name'), show_data.get('summary')) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 41fddc375..ffea438cc 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -75,9 +75,12 @@ class RuutuIE(InfoExtractor): preference = -1 if proto == 'rtmp' else 1 label = child.get('label') tbr = int_or_none(child.get('bitrate')) + format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto + if not self._is_valid_url(video_url, video_id, format_id): + continue width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] formats.append({ - 'format_id': '%s-%s' % (proto, label if label else tbr), + 'format_id': format_id, 'url': video_url, 'width': width, 'height': height, diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py deleted file mode 100644 index 26655d690..000000000 --- a/youtube_dl/extractor/testtube.py +++ /dev/null @@ -1,90 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, -) - - -class TestTubeIE(InfoExtractor): - _VALID_URL = r'https?://testtube\.com/[^/?#]+/(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', - 'info_dict': { - 'id': '60163', - 'display_id': '5-weird-ways-plants-can-eat-animals', - 'duration': 275, - 'ext': 'webm', - 'title': '5 Weird Ways Plants Can Eat Animals', - 'description': 'Why have some plants evolved to eat meat?', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'DNews', - 'uploader_id': 'dnews', - }, - }, { - 'url': 'https://testtube.com/iflscience/insane-jet-ski-flipping', - 'info_dict': { - 'id': 'fAGfJ4YjVus', - 'ext': 'mp4', - 'title': 'Flipping Jet-Ski Skills | Outrageous Acts of Science', - 'uploader': 'Science Channel', - 'uploader_id': 'ScienceChannel', - 'upload_date': '20150203', - 'description': 'md5:e61374030015bae1d2e22f096d4769d6', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - webpage, 'youtube iframe', default=None) - if youtube_url: - return self.url_result(youtube_url, 'Youtube', video_id=display_id) - - video_id = self._search_regex( - r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);", - webpage, 'video ID') - - all_info = self._download_json( - 'https://testtube.com/api/getPlaylist.json?api_key=ba9c741bce1b9d8e3defcc22193f3651b8867e62&codecs=h264,vp8,theora&video_id=%s' % video_id, - video_id) - info = all_info['items'][0] - - formats = [] - for vcodec, fdatas in info['media'].items(): - for name, fdata in fdatas.items(): - formats.append({ - 'format_id': '%s-%s' % (vcodec, name), - 'url': fdata['url'], - 'vcodec': vcodec, - 'tbr': fdata.get('bitrate'), - }) - self._sort_formats(formats) - - duration = int_or_none(info.get('duration')) - images = info.get('images') - thumbnails = None - preference = qualities(['mini', 'small', 'medium', 'large']) - if images: - thumbnails = [{ - 'id': thumbnail_id, - 'url': img_url, - 'preference': preference(thumbnail_id) - } for thumbnail_id, img_url in images.items()] - - return { - 'id': video_id, - 'display_id': display_id, - 'title': info['title'], - 'description': info.get('summary'), - 'thumbnails': thumbnails, - 'uploader': info.get('show', {}).get('name'), - 'uploader_id': info.get('show', {}).get('slug'), - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index d6d038a8d..adc05ed5f 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -4,32 +4,9 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE -from .discovery import DiscoveryIE from ..compat import compat_urlparse -class TlcIE(DiscoveryIE): - IE_NAME = 'tlc.com' - _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' - - # DiscoveryIE has _TESTS - _TESTS = [{ - 'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm', - 'info_dict': { - 'id': '104493', - 'ext': 'mp4', - 'title': 'Too Big to Fly', - 'description': 'Buddy has taken on a high flying task.', - 'duration': 119, - 'timestamp': 1393365060, - 'upload_date': '20140225', - }, - 'params': { - 'skip_download': True, # requires ffmpef - }, - }] - - class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)' diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py new file mode 100644 index 000000000..b73279dec --- /dev/null +++ b/youtube_dl/extractor/tvland.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class TVLandIE(MTVServicesInfoExtractor): + IE_NAME = 'tvland.com' + _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' + _FEED_URL = 'http://www.tvland.com/feeds/mrss/' + _TESTS = [{ + 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', + 'playlist': [ + { + 'md5': '227e9723b9669c05bf51098b10287aa7', + 'info_dict': { + 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', + 'ext': 'mp4', + 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', + } + }, + { + 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', + 'info_dict': { + 'id': 'f4279548-6e13-40dd-92e8-860d27289197', + 'ext': 'mp4', + 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', + } + }, + { + 'md5': 'fde4c3bccd7cc7e3576b338734153cec', + 'info_dict': { + 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', + 'ext': 'mp4', + 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', + } + }, + { + 'md5': '247f6780cda6891f2e49b8ae2b10e017', + 'info_dict': { + 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', + 'ext': 'mp4', + 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', + } + }, + { + 'md5': 'fd269f33256e47bad5eb6c40de089ff6', + 'info_dict': { + 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7', + 'ext': 'mp4', + 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', + } + } + ], + }, { + 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', + 'md5': 'e2c6389401cf485df26c79c247b08713', + 'info_dict': { + 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', + 'ext': 'mp4', + 'title': 'Younger|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + }, + }] diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index fca5ddc69..4a492f784 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -155,7 +155,16 @@ class YahooIE(InfoExtractor): 'description': 'md5:8fc39608213295748e1e289807838c97', 'duration': 1646, }, - } + }, { + # it uses an alias to get the video_id + 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html', + 'info_dict': { + 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737', + 'ext': 'mp4', + 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking', + 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', + }, + }, ] def _real_extract(self, url): @@ -199,13 +208,22 @@ class YahooIE(InfoExtractor): r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) if items_json is None: - CONTENT_ID_REGEXES = [ - r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', - r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', - r'"first_videoid"\s*:\s*"([^"]+)"', - r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), - ] - video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID') + alias = self._search_regex( + r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None) + if alias is not None: + alias_info = self._download_json( + 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias, + display_id, 'Downloading alias info') + video_id = alias_info[0]['id'] + else: + CONTENT_ID_REGEXES = [ + r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', + r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', + r'"first_videoid"\s*:\s*"([^"]+)"', + r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), + ] + video_id = self._search_regex( + CONTENT_ID_REGEXES, webpage, 'content ID') else: items = json.loads(items_json) info = items['mediaItems']['query']['results']['mediaObj'][0] diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 3a3432be8..f767fa15f 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,6 +2,9 @@ from __future__ import unicode_literals import base64 +import random +import string +import time from .common import InfoExtractor from ..compat import ( @@ -141,6 +144,11 @@ class YoukuIE(InfoExtractor): return video_urls_dict + @staticmethod + def get_ysuid(): + return '%d%s' % (int(time.time()), ''.join([ + random.choice(string.ascii_letters) for i in range(3)])) + def get_hd(self, fm): hd_id_dict = { '3gp': '0', @@ -189,6 +197,8 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) + def retrieve_data(req_url, note): headers = { 'Referer': req_url, diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 2a1f2f6d1..c619a75e2 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -13,6 +13,7 @@ from ..utils import ( determine_ext, qualities, float_or_none, + ExtractorError, ) @@ -59,7 +60,6 @@ class ZDFIE(InfoExtractor): 'ext': 'flv', 'format_id': '%s-%d' % (proto, bitrate), 'tbr': bitrate, - 'protocol': proto, }) self._sort_formats(formats) return formats @@ -70,6 +70,15 @@ class ZDFIE(InfoExtractor): note='Downloading video info', errnote='Failed to download video info') + status_code = doc.find('./status/statuscode') + if status_code is not None and status_code.text != 'ok': + code = status_code.text + if code == 'notVisibleAnymore': + message = 'Video %s is not available' % video_id + else: + message = '%s returned error: %s' % (self.IE_NAME, code) + raise ExtractorError(message, expected=True) + title = doc.find('.//information/title').text description = xpath_text(doc, './/information/detail', 'description') duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) @@ -129,10 +138,10 @@ class ZDFIE(InfoExtractor): video_url, video_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) + video_url, video_id, f4m_id=format_id, fatal=False)) else: proto = format_m.group('proto').lower() diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a62baa305..790bd5b3b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.29' +__version__ = '2016.01.01' |