diff options
28 files changed, 785 insertions, 137 deletions
@@ -116,3 +116,4 @@ Duncan Keall Alexander Mamay Devin J. Pohly Eduardo Ferro Aldama +Jeff Buchbinder diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..ff66449eb --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): + return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): + cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] + prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + out, _ = prog.communicate(secret_msg) + return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text') +print(repr(r)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d6a1e67c6..72b365305 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,6 +112,7 @@ - **Discovery** - **divxstage**: DivxStage - **Dotsub** + - **DouyuTV** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -342,6 +343,7 @@ - **PornHubPlaylist** - **Pornotube** - **PornoXO** + - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **Puls4** @@ -367,6 +369,7 @@ - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **RUHD** - **rutube**: Rutube videos @@ -487,6 +490,7 @@ - **Ubu** - **udemy** - **udemy:course** + - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt - **ustream** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index db8a47d2d..652519831 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -14,6 +14,7 @@ from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import match_filter_func TEST_URL = 'http://localhost/sample.mp4' @@ -339,6 +340,8 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + +class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): def s_formats(lang, autocaption=False): return [{ @@ -461,6 +464,73 @@ class TestFormatSelection(unittest.TestCase): self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(audiofile) + def test_match_filter(self): + class FilterYDL(YDL): + def __init__(self, *args, **kwargs): + super(FilterYDL, self).__init__(*args, **kwargs) + self.params['simulate'] = True + + def process_info(self, info_dict): + super(YDL, self).process_info(info_dict) + + def _match_entry(self, info_dict, incomplete): + res = super(FilterYDL, self)._match_entry(info_dict, incomplete) + if res is None: + self.downloaded_info_dicts.append(info_dict) + return res + + first = { + 'id': '1', + 'url': TEST_URL, + 'title': 'one', + 'extractor': 'TEST', + 'duration': 30, + 'filesize': 10 * 1024, + } + second = { + 'id': '2', + 'url': TEST_URL, + 'title': 'two', + 'extractor': 'TEST', + 'duration': 10, + 'description': 'foo', + 'filesize': 5 * 1024, + } + videos = [first, second] + + def get_videos(filter_=None): + ydl = FilterYDL({'match_filter': filter_}) + for v in videos: + ydl.process_ie_result(v, download=True) + return [v['id'] for v in ydl.downloaded_info_dicts] + + res = get_videos() + self.assertEqual(res, ['1', '2']) + + def f(v): + if v['id'] == '1': + return None + else: + return 'Video id is not 1' + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func('duration < 30') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description = foo') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description =? foo') + res = get_videos(f) + self.assertEqual(res, ['1', '2']) + + f = match_filter_func('filesize > 5KiB') + res = get_videos(f) + self.assertEqual(res, ['1']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..111b902e1 --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + +class TestAES(unittest.TestCase): + def setUp(self): + self.key = self.iv = [0x20, 0x15] + 14 * [0] + self.secret_msg = b'Secret message goes here' + + def test_encrypt(self): + msg = b'message' + key = list(range(16)) + encrypted = aes_encrypt(bytes_to_intlist(msg), key) + decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + self.assertEqual(decrypted, msg) + + def test_cbc_decrypt(self): + data = bytes_to_intlist( + b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" + ) + decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_decrypt_text(self): + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' + ) + decrypted = (aes_decrypt_text(encrypted, password, 16)) + self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_http.py b/test/test_http.py index bd4d46fef..f2e305b6f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server +from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl import threading @@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase): r = ydl.extract_info('https://localhost:%d/video.html' % self.port) self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + +def _build_proxy_handler(name): + class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + proxy_name = name + + def log_message(self, format, *args): + pass + + def do_GET(self): + self.send_response(200) + self.send_header('Content-Type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) + return HTTPTestRequestHandler + + +class TestProxy(unittest.TestCase): + def setUp(self): + self.proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('normal')) + self.port = self.proxy.socket.getsockname()[1] + self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) + self.proxy_thread.daemon = True + self.proxy_thread.start() + + self.cn_proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('cn')) + self.cn_port = self.cn_proxy.socket.getsockname()[1] + self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) + self.cn_proxy_thread.daemon = True + self.cn_proxy_thread.start() + + def test_proxy(self): + cn_proxy = 'localhost:{0}'.format(self.cn_port) + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + 'cn_verification_proxy': cn_proxy, + }) + url = 'http://foo.com/bar' + response = ydl.urlopen(url).read().decode('utf-8') + self.assertEqual(response, 'normal: {0}'.format(url)) + + req = compat_urllib_request.Request(url) + req.add_header('Ytdl-request-proxy', cn_proxy) + response = ydl.urlopen(req).read().decode('utf-8') + self.assertEqual(response, 'cn: {0}'.format(url)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 8f790bf0a..a8ab87685 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -24,6 +24,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + ExtractorError, find_xpath_attr, fix_xml_ampersands, InAdvancePagedList, @@ -39,6 +40,7 @@ from youtube_dl.utils import ( read_batch_urls, sanitize_filename, sanitize_path, + sanitize_url_path_consecutive_slashes, shell_quote, smuggle_url, str_to_int, @@ -53,6 +55,7 @@ from youtube_dl.utils import ( urlencode_postdata, version_tuple, xpath_with_ns, + xpath_text, render_table, match_str, ) @@ -168,6 +171,26 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + def test_sanitize_url_path_consecutive_slashes(self): + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'), + 'http://hostname/foo/bar/filename.html') + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'), + 'http://hostname/foo/bar/filename.html') + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname//'), + 'http://hostname/') + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'), + 'http://hostname/foo/bar/filename.html') + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname/'), + 'http://hostname/') + self.assertEqual( + sanitize_url_path_consecutive_slashes('http://hostname/abc//'), + 'http://hostname/abc/') + def test_ordered_set(self): self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7]) self.assertEqual(orderedSet([]), []) @@ -229,6 +252,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(find('media:song/media:author').text, 'The Author') self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') + def test_xpath_text(self): + testxml = '''<root> + <div> + <p>Foo</p> + </div> + </root>''' + doc = xml.etree.ElementTree.fromstring(testxml) + self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') + self.assertTrue(xpath_text(doc, 'div/bar') is None) + self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) + def test_smuggle_url(self): data = {"ö": "ö", "abc": [3]} url = 'https://foo.bar/baz?x=y#a' diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5a83bc956..b5ef5e009 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -328,9 +328,6 @@ class YoutubeDL(object): 'Parameter outtmpl is bytes, but should be a unicode string. ' 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') - if '%(stitle)s' in self.params.get('outtmpl', ''): - self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') - self._setup_opener() if auto_init: @@ -1218,9 +1215,6 @@ class YoutubeDL(object): if len(info_dict['title']) > 200: info_dict['title'] = info_dict['title'][:197] + '...' - # Keep for backwards compatibility - info_dict['stitle'] = info_dict['title'] - if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e94779d40..a20492fc3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -107,6 +107,7 @@ from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE from .dotsub import DotsubIE +from .douyutv import DouyuTVIE from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE @@ -346,6 +347,7 @@ from .npo import ( ) from .nrk import ( NRKIE, + NRKPlaylistIE, NRKTVIE, ) from .ntvde import NTVDeIE @@ -381,6 +383,7 @@ from .pornhub import ( ) from .pornotube import PornotubeIE from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .puls4 import Puls4IE @@ -537,6 +540,7 @@ from .udemy import ( UdemyIE, UdemyCourseIE ) +from .ultimedia import UltimediaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 929dd3cc5..8273bd6c9 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -146,6 +146,7 @@ class ArteTVPlus7IE(InfoExtractor): formats.append(format) + self._check_formats(formats, video_id) self._sort_formats(formats) info_dict['formats'] = formats diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index e64b88fbc..6ded723c9 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -23,7 +23,6 @@ from ..utils import ( ) from ..aes import ( aes_cbc_decrypt, - inc, ) @@ -102,13 +101,6 @@ class CrunchyrollIE(InfoExtractor): key = obfuscate_key(id) - class Counter: - __value = iv - - def next_value(self): - temp = self.__value - self.__value = inc(self.__value) - return temp decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) return zlib.decompress(decrypted_data) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py new file mode 100644 index 000000000..d7956e6e4 --- /dev/null +++ b/youtube_dl/extractor/douyutv.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class DouyuTVIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' + _TEST = { + 'url': 'http://www.douyutv.com/iseven', + 'info_dict': { + 'id': 'iseven', + 'ext': 'flv', + 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:9e525642c25a0a24302869937cf69d17', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': '7师傅', + 'uploader_id': '431925', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + 'http://www.douyutv.com/api/client/room/%s' % video_id, video_id) + + data = config['data'] + + error_code = config.get('error', 0) + show_status = data.get('show_status') + if error_code is not 0: + raise ExtractorError( + 'Server reported error %i' % error_code, expected=True) + + # 1 = live, 2 = offline + if show_status == '2': + raise ExtractorError( + 'Live stream is offline', expected=True) + + base_url = data['rtmp_url'] + live_path = data['rtmp_live'] + + title = self._live_title(data['room_name']) + description = data.get('show_details') + thumbnail = data.get('room_src') + + uploader = data.get('nickname') + uploader_id = data.get('owner_uid') + + multi_formats = data.get('rtmp_multi_bitrate') + if not isinstance(multi_formats, dict): + multi_formats = {} + multi_formats['live'] = live_path + + formats = [{ + 'url': '%s/%s' % (base_url, format_path), + 'format_id': format_id, + 'preference': 1 if format_id == 'live' else 0, + } for format_id, format_path in multi_formats.items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4e6927b08..8716e4503 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1006,6 +1006,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for NYTimes player + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or @@ -1268,10 +1275,16 @@ class GenericIE(InfoExtractor): # HTML5 video found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) if not found: + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)', + r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, webpage) + if not found: + # Look also in Refresh HTTP header + refresh_header = head_response.headers.get('Refresh') + if refresh_header: + found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = found.group(1) self.report_following_redirect(new_url) diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py index 848d17beb..36ad4915c 100644 --- a/youtube_dl/extractor/grooveshark.py +++ b/youtube_dl/extractor/grooveshark.py @@ -140,9 +140,9 @@ class GroovesharkIE(InfoExtractor): if webpage is not None: o = GroovesharkHtmlParser.extract_object_tags(webpage) - return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) + return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'] - return (webpage, None) + return webpage, None def _real_initialize(self): self.ts = int(time.time() * 1000) # timestamp in millis @@ -154,7 +154,7 @@ class GroovesharkIE(InfoExtractor): swf_referer = None if self.do_playerpage_request: (_, player_objs) = self._get_playerpage(url) - if player_objs is not None: + if player_objs: swf_referer = self._build_swf_referer(url, player_objs[0]) self.to_screen('SWF Referer: %s' % swf_referer) diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index e46954b47..96f95979a 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor): description = self._og_search_description(webpage, default=None) thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage) duration = int_or_none(flashvars.get('duration')) - width = int_or_none(self._og_search_property('video:width', webpage, 'video width')) - height = int_or_none(self._og_search_property('video:height', webpage, 'video height')) + width = int_or_none(self._og_search_property( + 'video:width', webpage, 'video width', default=None)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, 'video height', default=None)) return { 'id': video_id, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 85eee141b..1484ac0d2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -88,12 +88,13 @@ class LetvIE(InfoExtractor): play_json_req = compat_urllib_request.Request( 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) ) - play_json_req.add_header( - 'Ytdl-request-proxy', - self._downloader.params.get('cn_verification_proxy')) + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) + play_json = self._download_json( play_json_req, - media_id, 'playJson data') + media_id, 'Downloading playJson data') # Check for errors playstatus = play_json['playstatus'] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 1831c6749..21aea0c55 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..compat import ( @@ -10,7 +11,6 @@ from ..utils import ( ExtractorError, HEADRequest, str_to_int, - parse_iso8601, ) @@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor): 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', - 'upload_date': '20111115', - 'timestamp': 1321359578, 'thumbnail': 're:https?://.*\.jpg', 'view_count': int, 'like_count': int, @@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor): 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', - 'ext': 'm4a', - 'title': 'Electric Relaxation vol. 3', + 'ext': 'mp3', + 'title': 'Caribou 7 inch Vinyl Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', - 'uploader': 'Daniel Drumz', + 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': 're:https?://.*/images/', 'view_count': int, 'like_count': int, }, }] - def _get_url(self, track_id, template_url): - server_count = 30 - for i in range(server_count): - url = template_url % i + def _get_url(self, track_id, template_url, server_number): + boundaries = (1, 30) + for nr in server_numbers(server_number, boundaries): + url = template_url % nr try: # We only want to know if the request succeed # don't download the whole file self._request_webpage( HEADRequest(url), track_id, - 'Checking URL %d/%d ...' % (i + 1, server_count + 1)) + 'Checking URL %d/%d ...' % (nr, boundaries[-1])) return url except ExtractorError: pass - return None def _real_extract(self, url): @@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') song_url = preview_url.replace('/previews/', '/c/originals/') + server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number')) template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self._get_url(track_id, template_url) + final_song_url = self._get_url(track_id, template_url, server_number) if final_song_url is None: self.to_screen('Trying with m4a extension') template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - final_song_url = self._get_url(track_id, template_url) + final_song_url = self._get_url(track_id, template_url, server_number) if final_song_url is None: raise ExtractorError('Unable to extract track url') PREFIX = ( - r'<span class="play-button[^"]*?"' + r'm-play-on-spacebar[^>]+' r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') title = self._html_search_regex( PREFIX + r'm-title="([^"]+)"', webpage, 'title') @@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor): r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) like_count = str_to_int(self._search_regex( - [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"', - r'/favorites/?">([0-9]+)<'], + r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"', webpage, 'like count', fatal=False)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', r'/listeners/?">([0-9,.]+)</a>'], webpage, 'play count', fatal=False)) - timestamp = parse_iso8601(self._search_regex( - r'<time itemprop="dateCreated" datetime="([^"]+)">', - webpage, 'upload date', default=None)) return { 'id': track_id, @@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': uploader, 'uploader_id': uploader_id, - 'timestamp': timestamp, 'view_count': view_count, 'like_count': like_count, } + + +def server_numbers(first, boundaries): + """ Server numbers to try in descending order of probable availability. + Starting from first (i.e. the number of the server hosting the preview file) + and going further and further up to the higher boundary and down to the + lower one in an alternating fashion. Namely: + + server_numbers(2, (1, 5)) + + # Where the preview server is 2, min number is 1 and max is 5. + # Yields: 2, 3, 1, 4, 5 + + Why not random numbers or increasing sequences? Since from what I've seen, + full length files seem to be hosted on servers whose number is closer to + that of the preview; to be confirmed. + """ + zip_longest = getattr(itertools, 'zip_longest', None) + if zip_longest is None: + # python 2.x + zip_longest = itertools.izip_longest + + if len(boundaries) != 2: + raise ValueError("boundaries should be a two-element tuple") + min, max = boundaries + highs = range(first + 1, max + 1) + lows = range(first - 1, min - 1, -1) + rest = filter( + None, itertools.chain.from_iterable(zip_longest(highs, lows))) + yield first + for n in rest: + yield n diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 7fb4e57df..ddec7b338 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -22,7 +22,7 @@ class NiconicoIE(InfoExtractor): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' - _TEST = { + _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { @@ -39,7 +39,24 @@ class NiconicoIE(InfoExtractor): 'username': 'ydl.niconico@gmail.com', 'password': 'youtube-dl', }, - } + }, { + 'url': 'http://www.nicovideo.jp/watch/nm14296458', + 'md5': '8db08e0158457cf852a31519fceea5bc', + 'info_dict': { + 'id': 'nm14296458', + 'ext': 'swf', + 'title': '【鏡音リン】Dance on media【オリジナル】take2!', + 'description': 'md5:', + 'uploader': 'りょうた', + 'uploader_id': '18822557', + 'upload_date': '20110429', + 'duration': 209, + }, + 'params': { + 'username': 'ydl.niconico@gmail.com', + 'password': 'youtube-dl', + }, + }] _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' @@ -89,7 +106,7 @@ class NiconicoIE(InfoExtractor): if self._AUTHENTICATED: # Get flv info flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') else: # Get external player info diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index bff36f9d3..e91d3a248 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,46 +14,48 @@ from ..utils import ( class NRKIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})' + _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' _TESTS = [ { - 'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/', - 'md5': 'a6eac35052f3b242bb6bb7f43aed5886', + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'bccd850baebefe23b56d708a113229c2', 'info_dict': { 'id': '150533', 'ext': 'flv', 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f' + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 263, } }, { - 'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/', - 'md5': '3471f2a51718195164e88f46bf427668', + 'url': 'http://www.nrk.no/video/PS*154915', + 'md5': '0b1493ba1aae7d9579a5ad5531bc395a', 'info_dict': { 'id': '154915', 'ext': 'flv', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, } }, ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - page = self._download_webpage(url, video_id) - - video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id') + video_id = self._match_id(url) data = self._download_json( - 'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') + 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, + video_id, 'Downloading media JSON') if data['usageRights']['isGeoBlocked']: - raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True) + raise ExtractorError( + 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', + expected=True) + + video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' - video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' + duration = parse_duration(data.get('duration')) images = data.get('images') if images: @@ -69,10 +71,51 @@ class NRKIE(InfoExtractor): 'ext': 'flv', 'title': data['title'], 'description': data['description'], + 'duration': duration, 'thumbnail': thumbnail, } +class NRKPlaylistIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'info_dict': { + 'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'title': 'Gjenopplev den historiske solformørkelsen', + 'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', + 'info_dict': { + 'id': 'rivertonprisen-til-karin-fossum-1.12266449', + 'title': 'Rivertonprisen til Karin Fossum', + 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', + }, + 'playlist_count': 5, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('nrk:%s' % video_id, 'NRK') + for video_id in re.findall( + r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"', + webpage) + ] + + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + class NRKTVIE(InfoExtractor): _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 56e1cad3b..03f0a4de6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,15 +1,17 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, +) class NYTimesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', 'md5': '18a525a510f942ada2720db5f31644c0', 'info_dict': { @@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor): 'uploader': 'Brett Weiner', 'duration': 419, } - } + }, { + 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON') + 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, + video_id, 'Downloading video JSON') title = video_data['headline'] - description = video_data['summary'] - duration = video_data['duration'] / 1000.0 + description = video_data.get('summary') + duration = float_or_none(video_data.get('duration'), 1000) uploader = video_data['byline'] timestamp = parse_iso8601(video_data['publication_date'][:-8]) @@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor): formats = [ { 'url': video['url'], - 'format_id': video['type'], - 'vcodec': video['video_codec'], - 'width': video['width'], - 'height': video['height'], - 'filesize': get_file_size(video['fileSize']), + 'format_id': video.get('type'), + 'vcodec': video.get('video_codec'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': get_file_size(video.get('fileSize')), } for video in video_data['renditions'] ] self._sort_formats(formats) @@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor): thumbnails = [ { 'url': 'http://www.nytimes.com/%s' % image['url'], - 'resolution': '%dx%d' % (image['width'], image['height']), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), } for image in video_data['images'] ] diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py new file mode 100644 index 000000000..01cc3d9ea --- /dev/null +++ b/youtube_dl/extractor/primesharetv.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ExtractorError + + +class PrimeShareTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' + + _TEST = { + 'url': 'http://primeshare.tv/download/238790B611', + 'md5': 'b92d9bf5461137c36228009f31533fbc', + 'info_dict': { + 'id': '238790B611', + 'ext': 'mp4', + 'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>File not exist<' in webpage: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + fields = dict(re.findall(r'''(?x)<input\s+ + type="hidden"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? + value="([^"]*)" + ''', webpage)) + + headers = { + 'Referer': url, + 'Content-Type': 'application/x-www-form-urlencoded', + } + + wait_time = int(self._search_regex( + r'var\s+cWaitTime\s*=\s*(\d+)', + webpage, 'wait time', default=7)) + 1 + self._sleep(wait_time, video_id) + + req = compat_urllib_request.Request( + url, compat_urllib_parse.urlencode(fields), headers) + video_page = self._download_webpage( + req, video_id, 'Downloading video page') + + video_url = self._search_regex( + r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", + video_page, 'video url') + + title = self._html_search_regex( + r'<h1>Watch\s*(?: )?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?: )?\s*<strong>', + video_page, 'title') + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': 'mp4', + } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index c04791997..11edf616a 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -4,22 +4,87 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .common import compat_str +from ..compat import ( + compat_str, + compat_urllib_request +) +from ..utils import sanitize_url_path_consecutive_slashes class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' - _TEST = { + _TESTS = [{ + 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', + 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', 'title': 'MV:Far East Movement《The Illest》', }, - 'skip': 'Only available from China', - } + 'params': { + 'cn_verification_proxy': 'proxy.uku.im:8888' + } + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'md5': '699060e75cf58858dd47fb9c03c42cfb', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'md5': '9bf34be48f2f4dadcb226c74127e203c', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + }, + 'playlist': [{ + 'md5': 'bdbfb8f39924725e6589c146bc1883ad', + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '8407e634175fdac706766481b9443450', + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dl testing video', + }, + 'params': { + 'skip_download': True + } + }] def _real_extract(self, url): @@ -29,8 +94,14 @@ class SohuIE(InfoExtractor): else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + req = compat_urllib_request.Request(base_data_url + vid_id) + + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + req.add_header('Ytdl-request-proxy', cn_verification_proxy) + return self._download_json( - base_data_url + vid_id, video_id, + req, video_id, 'Downloading JSON data for %s' % vid_id) mobj = re.match(self._VALID_URL, url) @@ -38,10 +109,8 @@ class SohuIE(InfoExtractor): mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) - raw_title = self._html_search_regex( - r'(?s)<title>(.+?)</title>', - webpage, 'video title') - title = raw_title.partition('-')[0].strip() + + title = self._og_search_title(webpage) vid = self._html_search_regex( r'var vid ?= ?["\'](\d+)["\']', @@ -77,7 +146,9 @@ class SohuIE(InfoExtractor): % (format_id, i + 1, part_count)) part_info = part_str.split('|') - video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_url = sanitize_url_path_consecutive_slashes( + '%s%s?key=%s' % (part_info[0], su[i], part_info[3])) formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index cbdaf9c7a..aad2bf222 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -23,6 +23,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' _LOGIN_URL = 'https://secure.twitch.tv/user/login' + _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -67,14 +68,14 @@ class TwitchBaseIE(InfoExtractor): 'authenticity_token': authenticity_token, 'redirect_on_login': '', 'embed_form': 'false', - 'mp_source_action': '', + 'mp_source_action': 'login-button', 'follow': '', - 'user[login]': username, - 'user[password]': password, + 'login': username, + 'password': password, } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py new file mode 100644 index 000000000..06554a1be --- /dev/null +++ b/youtube_dl/extractor/ultimedia.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, + clean_html, +) + + +class UltimediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' + _TESTS = [{ + # news + 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', + 'md5': '276a0e49de58c7e85d32b057837952a2', + 'info_dict': { + 'id': 's8uk0r', + 'ext': 'mp4', + 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', + 'description': 'md5:3e5c8fd65791487333dda5db8aed32af', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20150317', + }, + }, { + # music + 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', + 'md5': '2ea3513813cf230605c7e2ffe7eca61c', + 'info_dict': { + 'id': 'xvpfp8', + 'ext': 'mp4', + 'title': "Two - C'est la vie (Clip)", + 'description': 'Two', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20150224', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + deliver_url = self._search_regex( + r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', + webpage, 'deliver URL') + + deliver_page = self._download_webpage( + deliver_url, video_id, 'Downloading iframe page') + + if '>This video is currently not available' in deliver_page: + raise ExtractorError( + 'Video %s is currently not available' % video_id, expected=True) + + player = self._parse_json( + self._search_regex( + r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), + video_id) + + quality = qualities(['flash', 'html5']) + formats = [] + for mode in player['modes']: + video_url = mode.get('config', {}).get('file') + if not video_url: + continue + if re.match(r'https?://www\.youtube\.com/.+?', video_url): + return self.url_result(video_url, 'Youtube') + formats.append({ + 'url': video_url, + 'format_id': mode.get('type'), + 'quality': quality(mode.get('type')), + }) + self._sort_formats(formats) + + thumbnail = player.get('image') + + title = clean_html(( + self._html_search_regex( + r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', + webpage, 'title', default=None) + or self._search_regex( + r"var\s+nameVideo\s*=\s*'([^']+)'", + deliver_page, 'title'))) + + description = clean_html(self._html_search_regex( + r'(?s)<span>Description</span>(.+?)</p>', webpage, + 'description', fatal=False)) + + upload_date = unified_strdate(self._search_regex( + r'Ajouté le\s*<span>([^<]+)', webpage, + 'upload date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 273030316..eb309a7cd 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, - remove_start, -) +from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?:www\.)?videomega\.tv/ - (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+) + (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+) ''' _TEST = { - 'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ', + 'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4', 'md5': 'bf5c2f95c4c917536e80936af7bc51e1', 'info_dict': { - 'id': 'QR0HCUHI1661IHUCH0RQ', + 'id': '4GNA688SU99US886ANG4', 'ext': 'mp4', - 'title': 'Big Buck Bunny', + 'title': 'BigBuckBunny_320x180', 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -33,34 +26,24 @@ class VideoMegaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id) + iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id req = compat_urllib_request.Request(iframe_url) req.add_header('Referer', url) webpage = self._download_webpage(req, video_id) - try: - escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1] - except IndexError: - raise ExtractorError('Unable to extract escaped data') - - playlist = compat_urllib_parse.unquote(escaped_data) - + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title') + title = re.sub( + r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) - video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') - title = remove_start(self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - }] - self._sort_formats(formats) + r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + video_url = self._search_regex( + r'<source[^>]+?src="([^"]+)"', webpage, 'video URL') return { 'id': video_id, 'title': title, - 'formats': formats, + 'url': video_url, 'thumbnail': thumbnail, 'http_headers': { 'Referer': iframe_url, diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0b58fe0fe..c3187cfeb 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -33,14 +33,13 @@ class VineIE(InfoExtractor): r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data')) formats = [{ - 'url': data['videoLowURL'], - 'ext': 'mp4', - 'format_id': 'low', - }, { - 'url': data['videoUrl'], - 'ext': 'mp4', - 'format_id': 'standard', - }] + 'format_id': '%(format)s-%(rate)s' % f, + 'vcodec': f['format'], + 'quality': f['rate'], + 'url': f['videoUrl'], + } for f in data['videoUrls'] if f.get('rate')] + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c3135effc..472d4df41 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -326,6 +326,13 @@ def sanitize_path(s): return os.path.join(*sanitized_path) +def sanitize_url_path_consecutive_slashes(url): + """Collapses consecutive slashes in URLs' path""" + parsed_url = list(compat_urlparse.urlparse(url)) + parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) + return compat_urlparse.urlunparse(parsed_url) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7ed07c375..51b4260aa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.03.15' +__version__ = '2015.03.18' |