diff options
-rw-r--r-- | test/helper.py | 3 | ||||
-rw-r--r-- | test/test_InfoExtractor.py | 10 | ||||
-rw-r--r-- | test/test_utils.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/firsttv.py | 139 | ||||
-rw-r--r-- | youtube_dl/extractor/funnyordie.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/internetvideoarchive.py | 118 | ||||
-rw-r--r-- | youtube_dl/extractor/rottentomatoes.py | 24 | ||||
-rw-r--r-- | youtube_dl/extractor/videodetective.py | 9 |
8 files changed, 202 insertions, 113 deletions
diff --git a/test/helper.py b/test/helper.py index f2d878212..b8e22c5cb 100644 --- a/test/helper.py +++ b/test/helper.py @@ -143,6 +143,9 @@ def expect_value(self, got, expected, field): expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): + self.assertTrue( + isinstance(got, compat_str), + 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) elif isinstance(expected, compat_str) and expected.startswith('mincount:'): self.assertTrue( diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 938466a80..6404ac89f 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError class TestIE(InfoExtractor): @@ -66,5 +67,14 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + def test_download_json(self): + uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') + self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) + uri = encode_data_uri(b'callback({"foo": "blah"})', 'application/javascript') + self.assertEqual(self.ie._download_json(uri, None, transform_source=strip_jsonp), {'foo': 'blah'}) + uri = encode_data_uri(b'{"foo": invalid}', 'application/json') + self.assertRaises(ExtractorError, self.ie._download_json, uri, None) + self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index a35debfe1..0f36bb9f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( args_to_str, encode_base_n, clean_html, + date_from_str, DateRange, detect_exe_version, determine_ext, @@ -234,6 +235,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + def test_date_from_str(self): + self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) + self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) + self.assertEqual(date_from_str('now+14day'), date_from_str('now+2week')) + self.assertEqual(date_from_str('now+365day'), date_from_str('now+1year')) + self.assertEqual(date_from_str('now+30day'), date_from_str('now+1month')) + def test_daterange(self): _20century = DateRange("19000101", "20000101") self.assertFalse("17890714" in _20century) diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 98b165143..88bca1007 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,78 +2,133 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_xpath +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + xpath_attr, + xpath_element, + xpath_text, + xpath_with_ns, +) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '777f525feeec4806130f4f764bc18a4f', + # single format via video_materials.json API + 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', + 'md5': '82a2777648acae812d58b3f5bd42882b', 'info_dict': { - 'id': '73390', + 'id': '35930', 'ext': 'mp4', - 'title': 'Олимпийские канатные дороги', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'description': 'md5:357933adeede13b202c7c21f91b871b2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 149, - 'like_count': int, - 'dislike_count': int, + 'upload_date': '20150212', + 'duration': 2694, }, - 'skip': 'Only works from Russia', }, { - 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + # multiple formats via video_materials.json API + 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', 'info_dict': { - 'id': '35930', + 'id': '113641', 'ext': 'mp4', - 'title': 'Наедине со всеми. Людмила Сенчина', - 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', + 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 2694, + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Only works from Russia', + }, { + # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API + 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', + 'md5': '519d306c5b5669761fd8906c39dbee23', + 'info_dict': { + 'id': '47038', + 'ext': 'mp4', + 'title': '"Побег". Второй сезон. 3 серия', + 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20120516', + 'duration': 3080, + }, + }, { + 'url': 'http://www.1tv.ru/videoarchive/9967', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, 'Downloading page') + # Videos with multiple formats only available via this API + video = self._download_json( + 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, + video_id, fatal=False) - video_url = self._html_search_regex( - r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', - webpage, 'video URL') + description, thumbnail, upload_date, duration = [None] * 4 - title = self._html_search_regex( - [r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"], webpage, 'title') - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description') + if video: + item = video[0] + title = item['title'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [{ + 'url': f['src'], + 'format_id': f.get('name'), + 'quality': quality(f.get('name')), + } for f in item['mbr'] if f.get('src')] + thumbnail = item.get('poster') + else: + # Some videos are not available via video_materials.json + video = self._download_xml( + 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, + video_id) + + NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + } - thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property( - 'video:duration', webpage, - 'video duration', fatal=False) + item = xpath_element(video, './channel/item', fatal=True) + title = xpath_text(item, './title', fatal=True) + formats = [{ + 'url': content.attrib['url'], + } for content in item.findall( + compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] + thumbnail = xpath_attr( + item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') - like_count = self._html_search_regex( - r'title="Понравилось".*?/></label> \[(\d+)\]', - webpage, 'like count', default=None) - dislike_count = self._html_search_regex( - r'title="Не понравилось".*?/></label> \[(\d+)\]', - webpage, 'dislike count', default=None) + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) + if webpage: + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or title + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description') + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, + 'upload_date': upload_date, 'duration': int_or_none(duration), - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), + 'formats': formats } diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4c4a87e2a..8c5ffc9e8 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor): links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) m3u8_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1', - webpage, 'm3u8 url', default=None, group='url') + r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') formats = [] diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index e60145b3d..45add007f 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,93 +1,91 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( - xpath_with_ns, + determine_ext, + int_or_none, + xpath_text, ) class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' _TEST = { - 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', 'info_dict': { - 'id': '452693', + 'id': '194487', 'ext': 'mp4', - 'title': 'SKYFALL', - 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - 'duration': 152, + 'title': 'KICK-ASS 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @staticmethod - def _build_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query @staticmethod - def _clean_query(query): - NEEDED_ARGS = ['publishedid', 'customerid'] - query_dic = compat_urlparse.parse_qs(query) - cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) - # Other player ids return m3u8 urls - cleaned_dic['playerid'] = '247' - cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse_urlencode(cleaned_dic) + def _build_xml_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query def _real_extract(self, url): query = compat_urlparse.urlparse(url).query - query_dic = compat_urlparse.parse_qs(query) + query_dic = compat_parse_qs(query) video_id = query_dic['publishedid'][0] - url = self._build_url(query) - flashconfiguration = self._download_xml(url, video_id, - 'Downloading flash configuration') - file_url = flashconfiguration.find('file').text - file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') - # Replace some of the parameters in the query to get the best quality - # and http links (no m3u8 manifests) - file_url = re.sub(r'(?<=\?)(.+)$', - lambda m: self._clean_query(m.group()), - file_url) - info = self._download_xml(file_url, video_id, - 'Downloading video info') - item = info.find('channel/item') + if '/player/' in url: + configuration = self._download_json(url, video_id) + + # There are multiple videos in the playlist whlie only the first one + # matches the video played in browsers + video_info = configuration['playlist'][0] + + formats = [] + for source in video_info['sources']: + file_url = source['file'] + if determine_ext(file_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, ext='mp4', m3u8_id='hls')) + else: + a_format = { + 'url': file_url, + } + + if source.get('label') and source['label'][-4:] == ' kbs': + tbr = int_or_none(source['label'][:-4]) + a_format.update({ + 'tbr': tbr, + 'format_id': 'http-%d' % tbr, + }) + formats.append(a_format) - def _bp(p): - return xpath_with_ns( - p, - { - 'media': 'http://search.yahoo.com/mrss/', - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', - } - ) - formats = [] - for content in item.findall(_bp('media:group/media:content')): - attr = content.attrib - f_url = attr['url'] - width = int(attr['width']) - bitrate = int(attr['bitrate']) - format_id = '%d-%dk' % (width, bitrate) - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'width': width, - 'tbr': bitrate, - }) + self._sort_formats(formats) - self._sort_formats(formats) + title = video_info['title'] + description = video_info.get('description') + thumbnail = video_info.get('image') + else: + configuration = self._download_xml(url, video_id) + formats = [{ + 'url': xpath_text(configuration, './file', 'file URL', fatal=True), + }] + thumbnail = xpath_text(configuration, './image', 'thumbnail') + title = 'InternetVideoArchive video %s' % video_id + description = None return { 'id': video_id, - 'title': item.find('title').text, + 'title': title, 'formats': formats, - 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], - 'description': item.find('description').text, - 'duration': int(attr['duration']), + 'thumbnail': thumbnail, + 'description': description, } diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index e8bb20a08..f9cd48790 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,11 +1,11 @@ from __future__ import unicode_literals -from .videodetective import VideoDetectiveIE +from .common import InfoExtractor +from ..compat import compat_urlparse +from .internetvideoarchive import InternetVideoArchiveIE -# It just uses the same method as videodetective.com, -# the internetvideoarchive.com is extracted from the og:video property -class RottenTomatoesIE(VideoDetectiveIE): +class RottenTomatoesIE(InfoExtractor): _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' _TEST = { @@ -13,7 +13,19 @@ class RottenTomatoesIE(VideoDetectiveIE): 'info_dict': { 'id': '613340', 'ext': 'mp4', - 'title': 'TOY STORY 3', - 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'title': 'Toy Story 3', }, } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + + return { + '_type': 'url_transparent', + 'url': InternetVideoArchiveIE._build_xml_url(query), + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'title': self._og_search_title(webpage), + } diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 0ffc7ff7d..2ed5d9643 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -14,8 +14,11 @@ class VideoDetectiveIE(InfoExtractor): 'id': '194487', 'ext': 'mp4', 'title': 'KICK-ASS 2', - 'description': 'md5:65ba37ad619165afac7d432eaded6013', - 'duration': 138, + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @@ -24,4 +27,4 @@ class VideoDetectiveIE(InfoExtractor): webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage) query = compat_urlparse.urlparse(og_video).query - return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key()) + return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) |