diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/chilloutzone.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/cnn.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/drtv.py | 37 | ||||
-rw-r--r-- | youtube_dl/extractor/mitele.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/nba.py | 20 | ||||
-rw-r--r-- | youtube_dl/extractor/nextmedia.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/prosiebensat1.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/rtve.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/rutv.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/shared.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/sportbox.py | 26 | ||||
-rw-r--r-- | youtube_dl/extractor/telecinco.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/tutv.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/videott.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/viki.py | 5 |
16 files changed, 94 insertions, 66 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 24efb7ce5..8bb3926a0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -338,8 +338,7 @@ from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '6a75fe9d0d3275bead0cb683c616fddb', 'info_dict': { 'id': '0fce117d', 'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor): 'display_id': 'programa-144', 'duration': 2913, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor): episode, transform_source=strip_jsonp ) + formats = self._extract_m3u8_formats( + token_info['tokenizedUrl'], episode, ext='mp4') return { 'id': embed_data['videoId'], 'display_id': episode, 'title': info_el.find('title').text, - 'url': token_info['tokenizedUrl'], + 'formats': formats, 'description': get_element_by_attribute('class', 'text', webpage), 'thumbnail': info_el.find('thumb').text, 'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, + }, { + 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'info_dict': { + 'id': '0041400301-cle-atl-recap.nba', + 'ext': 'mp4', + 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', + 'duration': 228, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor): self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') description = self._og_search_description(webpage) - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration')) + duration_str = self._html_search_meta( + 'duration', webpage, 'duration', default=None) + if not duration_str: + duration_str = self._html_search_regex( + r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) + duration = parse_duration(duration_str) return { 'id': shortened_video_id, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:b23787119933404ce515c6356a8c355c', + 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', + 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } - }] - - _URL_PATTERN = r'\{url: \'(.+)\'\}' - - def _fetch_title(self, page): - return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - - def _fetch_thumbnail(self, page): - return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - - def _fetch_timestamp(self, page): - return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): - _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' - _TESTS = [{ + }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): 'expected_warnings': [ 'video thumbnail', ] + }, { + 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', + 'only_matching': True, }] + _URL_PATTERN = r'\{url: \'(.+)\'\}' + def _fetch_title(self, page): - return self._html_search_meta('description', page, 'news title') + return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or + self._html_search_meta('description', page, 'news title')) + + def _fetch_thumbnail(self, page): + return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + + def _fetch_timestamp(self, page): + return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 849300140..82cd98ac7 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ from ..utils import ( def _decrypt_url(png): - encrypted_data = base64.b64decode(png) + encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = struct_unpack('!I', text_chunk[:4])[0] diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor): @classmethod def _extract_url(cls, webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 26ced716e..9f3e944e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -47,7 +47,7 @@ class SharedIE(InfoExtractor): video_url = self._html_search_regex( r'data-url="([^"]+)"', video_page, 'video URL') title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') + 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 8686f9d11..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,8 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - parse_duration, - parse_iso8601, + unified_strdate, ) @@ -20,11 +19,9 @@ class SportBoxIE(InfoExtractor): 'id': '80822', 'ext': 'mp4', 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', + 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1411896237, 'upload_date': '20140928', - 'duration': 4846, }, 'params': { # m3u8 download @@ -48,17 +45,13 @@ class SportBoxIE(InfoExtractor): r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( - r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div itemprop="description">(.+?)</div>', - webpage, 'description', fatal=False) + [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], + webpage, 'title') + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'<span itemprop="uploadDate">([^<]+)</span>', - webpage, 'timestamp', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'<meta itemprop="duration" content="PT([^"]+)">', - webpage, 'duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'dateCreated', webpage, 'upload date')) return { '_type': 'url_transparent', @@ -67,8 +60,7 @@ class SportBoxIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 251a68680..a0c744fd1 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE): 'title': 'Con Martín Berasategui, hacer un bacalao al ...', 'duration': 662, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4de0aac52..fad720b68 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -26,7 +26,7 @@ class TutvIE(InfoExtractor): data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') + video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') return { 'id': internal_id, diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index ececc7ee0..591024ead 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor): formats = [ { - 'url': base64.b64decode(res['u']).decode('utf-8'), + 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'), 'ext': 'flv', 'format_id': res['l'], } for res in settings['res'] if res['u'] diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index fe7229952..7f2fb1ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -15,6 +15,7 @@ from .common import InfoExtractor class VikiBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' @@ -58,7 +59,7 @@ class VikiBaseIE(InfoExtractor): class VikiIE(VikiBaseIE): IE_NAME = 'viki' - _VALID_URL = r'https?://(?:www\.)?viki\.com/(?:videos|player)/(?P<id>[0-9]+v)' + _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -229,7 +230,7 @@ class VikiIE(VikiBaseIE): class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' - _VALID_URL = r'https?://(?:www\.)?viki\.com/(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' + _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', 'info_dict': { |