diff options
28 files changed, 785 insertions, 137 deletions
| @@ -116,3 +116,4 @@ Duncan Keall  Alexander Mamay  Devin J. Pohly  Eduardo Ferro Aldama +Jeff Buchbinder diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..ff66449eb --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): +    return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): +    cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] +    prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    out, _ = prog.communicate(secret_msg) +    return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text') +print(repr(r)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d6a1e67c6..72b365305 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -112,6 +112,7 @@   - **Discovery**   - **divxstage**: DivxStage   - **Dotsub** + - **DouyuTV**   - **DRBonanza**   - **Dropbox**   - **DrTuber** @@ -342,6 +343,7 @@   - **PornHubPlaylist**   - **Pornotube**   - **PornoXO** + - **PrimeShareTV**   - **PromptFile**   - **prosiebensat1**: ProSiebenSat.1 Digital   - **Puls4** @@ -367,6 +369,7 @@   - **RTP**   - **RTS**: RTS.ch   - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil   - **rtve.es:live**: RTVE.es live streams   - **RUHD**   - **rutube**: Rutube videos @@ -487,6 +490,7 @@   - **Ubu**   - **udemy**   - **udemy:course** + - **Ultimedia**   - **Unistra**   - **Urort**: NRK P3 Urørt   - **ustream** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index db8a47d2d..652519831 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -14,6 +14,7 @@ from test.helper import FakeYDL, assertRegexpMatches  from youtube_dl import YoutubeDL  from youtube_dl.extractor import YoutubeIE  from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import match_filter_func  TEST_URL = 'http://localhost/sample.mp4' @@ -339,6 +340,8 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], 'G') + +class TestYoutubeDL(unittest.TestCase):      def test_subtitles(self):          def s_formats(lang, autocaption=False):              return [{ @@ -461,6 +464,73 @@ class TestFormatSelection(unittest.TestCase):          self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)          os.unlink(audiofile) +    def test_match_filter(self): +        class FilterYDL(YDL): +            def __init__(self, *args, **kwargs): +                super(FilterYDL, self).__init__(*args, **kwargs) +                self.params['simulate'] = True + +            def process_info(self, info_dict): +                super(YDL, self).process_info(info_dict) + +            def _match_entry(self, info_dict, incomplete): +                res = super(FilterYDL, self)._match_entry(info_dict, incomplete) +                if res is None: +                    self.downloaded_info_dicts.append(info_dict) +                return res + +        first = { +            'id': '1', +            'url': TEST_URL, +            'title': 'one', +            'extractor': 'TEST', +            'duration': 30, +            'filesize': 10 * 1024, +        } +        second = { +            'id': '2', +            'url': TEST_URL, +            'title': 'two', +            'extractor': 'TEST', +            'duration': 10, +            'description': 'foo', +            'filesize': 5 * 1024, +        } +        videos = [first, second] + +        def get_videos(filter_=None): +            ydl = FilterYDL({'match_filter': filter_}) +            for v in videos: +                ydl.process_ie_result(v, download=True) +            return [v['id'] for v in ydl.downloaded_info_dicts] + +        res = get_videos() +        self.assertEqual(res, ['1', '2']) + +        def f(v): +            if v['id'] == '1': +                return None +            else: +                return 'Video id is not 1' +        res = get_videos(f) +        self.assertEqual(res, ['1']) + +        f = match_filter_func('duration < 30') +        res = get_videos(f) +        self.assertEqual(res, ['2']) + +        f = match_filter_func('description = foo') +        res = get_videos(f) +        self.assertEqual(res, ['2']) + +        f = match_filter_func('description =? foo') +        res = get_videos(f) +        self.assertEqual(res, ['1', '2']) + +        f = match_filter_func('filesize > 5KiB') +        res = get_videos(f) +        self.assertEqual(res, ['1']) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..111b902e1 --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + +class TestAES(unittest.TestCase): +    def setUp(self): +        self.key = self.iv = [0x20, 0x15] + 14 * [0] +        self.secret_msg = b'Secret message goes here' + +    def test_encrypt(self): +        msg = b'message' +        key = list(range(16)) +        encrypted = aes_encrypt(bytes_to_intlist(msg), key) +        decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) +        self.assertEqual(decrypted, msg) + +    def test_cbc_decrypt(self): +        data = bytes_to_intlist( +            b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" +        ) +        decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) +        self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + +    def test_decrypt_text(self): +        password = intlist_to_bytes(self.key).decode('utf-8') +        encrypted = base64.b64encode( +            intlist_to_bytes(self.iv[:8]) + +            b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' +        ) +        decrypted = (aes_decrypt_text(encrypted, password, 16)) +        self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_http.py b/test/test_http.py index bd4d46fef..f2e305b6f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,7 @@ import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server +from youtube_dl.compat import compat_http_server, compat_urllib_request  import ssl  import threading @@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase):          r = ydl.extract_info('https://localhost:%d/video.html' % self.port)          self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + +def _build_proxy_handler(name): +    class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): +        proxy_name = name + +        def log_message(self, format, *args): +            pass + +        def do_GET(self): +            self.send_response(200) +            self.send_header('Content-Type', 'text/plain; charset=utf-8') +            self.end_headers() +            self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) +    return HTTPTestRequestHandler + + +class TestProxy(unittest.TestCase): +    def setUp(self): +        self.proxy = compat_http_server.HTTPServer( +            ('localhost', 0), _build_proxy_handler('normal')) +        self.port = self.proxy.socket.getsockname()[1] +        self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) +        self.proxy_thread.daemon = True +        self.proxy_thread.start() + +        self.cn_proxy = compat_http_server.HTTPServer( +            ('localhost', 0), _build_proxy_handler('cn')) +        self.cn_port = self.cn_proxy.socket.getsockname()[1] +        self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) +        self.cn_proxy_thread.daemon = True +        self.cn_proxy_thread.start() + +    def test_proxy(self): +        cn_proxy = 'localhost:{0}'.format(self.cn_port) +        ydl = YoutubeDL({ +            'proxy': 'localhost:{0}'.format(self.port), +            'cn_verification_proxy': cn_proxy, +        }) +        url = 'http://foo.com/bar' +        response = ydl.urlopen(url).read().decode('utf-8') +        self.assertEqual(response, 'normal: {0}'.format(url)) + +        req = compat_urllib_request.Request(url) +        req.add_header('Ytdl-request-proxy', cn_proxy) +        response = ydl.urlopen(req).read().decode('utf-8') +        self.assertEqual(response, 'cn: {0}'.format(url)) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 8f790bf0a..a8ab87685 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -24,6 +24,7 @@ from youtube_dl.utils import (      encodeFilename,      escape_rfc3986,      escape_url, +    ExtractorError,      find_xpath_attr,      fix_xml_ampersands,      InAdvancePagedList, @@ -39,6 +40,7 @@ from youtube_dl.utils import (      read_batch_urls,      sanitize_filename,      sanitize_path, +    sanitize_url_path_consecutive_slashes,      shell_quote,      smuggle_url,      str_to_int, @@ -53,6 +55,7 @@ from youtube_dl.utils import (      urlencode_postdata,      version_tuple,      xpath_with_ns, +    xpath_text,      render_table,      match_str,  ) @@ -168,6 +171,26 @@ class TestUtil(unittest.TestCase):          self.assertEqual(sanitize_path('./abc'), 'abc')          self.assertEqual(sanitize_path('./../abc'), '..\\abc') +    def test_sanitize_url_path_consecutive_slashes(self): +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname//'), +            'http://hostname/') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/'), +            'http://hostname/') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/abc//'), +            'http://hostname/abc/') +      def test_ordered_set(self):          self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])          self.assertEqual(orderedSet([]), []) @@ -229,6 +252,17 @@ class TestUtil(unittest.TestCase):          self.assertEqual(find('media:song/media:author').text, 'The Author')          self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') +    def test_xpath_text(self): +        testxml = '''<root> +            <div> +                <p>Foo</p> +            </div> +        </root>''' +        doc = xml.etree.ElementTree.fromstring(testxml) +        self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') +        self.assertTrue(xpath_text(doc, 'div/bar') is None) +        self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) +      def test_smuggle_url(self):          data = {"ö": "ö", "abc": [3]}          url = 'https://foo.bar/baz?x=y#a' diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5a83bc956..b5ef5e009 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -328,9 +328,6 @@ class YoutubeDL(object):                  'Parameter outtmpl is bytes, but should be a unicode string. '                  'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.') -        if '%(stitle)s' in self.params.get('outtmpl', ''): -            self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') -          self._setup_opener()          if auto_init: @@ -1218,9 +1215,6 @@ class YoutubeDL(object):          if len(info_dict['title']) > 200:              info_dict['title'] = info_dict['title'][:197] + '...' -        # Keep for backwards compatibility -        info_dict['stitle'] = info_dict['title'] -          if 'format' not in info_dict:              info_dict['format'] = info_dict['ext'] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e94779d40..a20492fc3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -107,6 +107,7 @@ from .dctp import DctpTvIE  from .deezer import DeezerPlaylistIE  from .dfb import DFBIE  from .dotsub import DotsubIE +from .douyutv import DouyuTVIE  from .dreisat import DreiSatIE  from .drbonanza import DRBonanzaIE  from .drtuber import DrTuberIE @@ -346,6 +347,7 @@ from .npo import (  )  from .nrk import (      NRKIE, +    NRKPlaylistIE,      NRKTVIE,  )  from .ntvde import NTVDeIE @@ -381,6 +383,7 @@ from .pornhub import (  )  from .pornotube import PornotubeIE  from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE  from .promptfile import PromptFileIE  from .prosiebensat1 import ProSiebenSat1IE  from .puls4 import Puls4IE @@ -537,6 +540,7 @@ from .udemy import (      UdemyIE,      UdemyCourseIE  ) +from .ultimedia import UltimediaIE  from .unistra import UnistraIE  from .urort import UrortIE  from .ustream import UstreamIE, UstreamChannelIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 929dd3cc5..8273bd6c9 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -146,6 +146,7 @@ class ArteTVPlus7IE(InfoExtractor):              formats.append(format) +        self._check_formats(formats, video_id)          self._sort_formats(formats)          info_dict['formats'] = formats diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index e64b88fbc..6ded723c9 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -23,7 +23,6 @@ from ..utils import (  )  from ..aes import (      aes_cbc_decrypt, -    inc,  ) @@ -102,13 +101,6 @@ class CrunchyrollIE(InfoExtractor):          key = obfuscate_key(id) -        class Counter: -            __value = iv - -            def next_value(self): -                temp = self.__value -                self.__value = inc(self.__value) -                return temp          decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))          return zlib.decompress(decrypted_data) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py new file mode 100644 index 000000000..d7956e6e4 --- /dev/null +++ b/youtube_dl/extractor/douyutv.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class DouyuTVIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' +    _TEST = { +        'url': 'http://www.douyutv.com/iseven', +        'info_dict': { +            'id': 'iseven', +            'ext': 'flv', +            'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:9e525642c25a0a24302869937cf69d17', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': '7师傅', +            'uploader_id': '431925', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        config = self._download_json( +            'http://www.douyutv.com/api/client/room/%s' % video_id, video_id) + +        data = config['data'] + +        error_code = config.get('error', 0) +        show_status = data.get('show_status') +        if error_code is not 0: +            raise ExtractorError( +                'Server reported error %i' % error_code, expected=True) + +        # 1 = live, 2 = offline +        if show_status == '2': +            raise ExtractorError( +                'Live stream is offline', expected=True) + +        base_url = data['rtmp_url'] +        live_path = data['rtmp_live'] + +        title = self._live_title(data['room_name']) +        description = data.get('show_details') +        thumbnail = data.get('room_src') + +        uploader = data.get('nickname') +        uploader_id = data.get('owner_uid') + +        multi_formats = data.get('rtmp_multi_bitrate') +        if not isinstance(multi_formats, dict): +            multi_formats = {} +        multi_formats['live'] = live_path + +        formats = [{ +            'url': '%s/%s' % (base_url, format_path), +            'format_id': format_id, +            'preference': 1 if format_id == 'live' else 0, +        } for format_id, format_path in multi_formats.items()] +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'formats': formats, +            'is_live': True, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4e6927b08..8716e4503 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1006,6 +1006,13 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url')) +        # Look for NYTimes player +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url')) +          # Look for Ooyala videos          mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or @@ -1268,10 +1275,16 @@ class GenericIE(InfoExtractor):              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found: +            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'              found = re.search(                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' -                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)', +                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,                  webpage) +            if not found: +                # Look also in Refresh HTTP header +                refresh_header = head_response.headers.get('Refresh') +                if refresh_header: +                    found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = found.group(1)                  self.report_following_redirect(new_url) diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py index 848d17beb..36ad4915c 100644 --- a/youtube_dl/extractor/grooveshark.py +++ b/youtube_dl/extractor/grooveshark.py @@ -140,9 +140,9 @@ class GroovesharkIE(InfoExtractor):          if webpage is not None:              o = GroovesharkHtmlParser.extract_object_tags(webpage) -            return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) +            return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'] -        return (webpage, None) +        return webpage, None      def _real_initialize(self):          self.ts = int(time.time() * 1000)  # timestamp in millis @@ -154,7 +154,7 @@ class GroovesharkIE(InfoExtractor):          swf_referer = None          if self.do_playerpage_request:              (_, player_objs) = self._get_playerpage(url) -            if player_objs is not None: +            if player_objs:                  swf_referer = self._build_swf_referer(url, player_objs[0])                  self.to_screen('SWF Referer: %s' % swf_referer) diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index e46954b47..96f95979a 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor):          description = self._og_search_description(webpage, default=None)          thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)          duration = int_or_none(flashvars.get('duration')) -        width = int_or_none(self._og_search_property('video:width', webpage, 'video width')) -        height = int_or_none(self._og_search_property('video:height', webpage, 'video height')) +        width = int_or_none(self._og_search_property( +            'video:width', webpage, 'video width', default=None)) +        height = int_or_none(self._og_search_property( +            'video:height', webpage, 'video height', default=None))          return {              'id': video_id, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 85eee141b..1484ac0d2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -88,12 +88,13 @@ class LetvIE(InfoExtractor):          play_json_req = compat_urllib_request.Request(              'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)          ) -        play_json_req.add_header( -            'Ytdl-request-proxy', -            self._downloader.params.get('cn_verification_proxy')) +        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') +        if cn_verification_proxy: +            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) +          play_json = self._download_json(              play_json_req, -            media_id, 'playJson data') +            media_id, 'Downloading playJson data')          # Check for errors          playstatus = play_json['playstatus'] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 1831c6749..21aea0c55 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import re +import itertools  from .common import InfoExtractor  from ..compat import ( @@ -10,7 +11,6 @@ from ..utils import (      ExtractorError,      HEADRequest,      str_to_int, -    parse_iso8601,  ) @@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor):              'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',              'uploader': 'Daniel Holbach',              'uploader_id': 'dholbach', -            'upload_date': '20111115', -            'timestamp': 1321359578,              'thumbnail': 're:https?://.*\.jpg',              'view_count': int,              'like_count': int, @@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor):          'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',          'info_dict': {              'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', -            'ext': 'm4a', -            'title': 'Electric Relaxation vol. 3', +            'ext': 'mp3', +            'title': 'Caribou 7 inch Vinyl Mix & Chat',              'description': 'md5:2b8aec6adce69f9d41724647c65875e8', -            'uploader': 'Daniel Drumz', +            'uploader': 'Gilles Peterson Worldwide',              'uploader_id': 'gillespeterson', -            'thumbnail': 're:https?://.*\.jpg', +            'thumbnail': 're:https?://.*/images/',              'view_count': int,              'like_count': int,          },      }] -    def _get_url(self, track_id, template_url): -        server_count = 30 -        for i in range(server_count): -            url = template_url % i +    def _get_url(self, track_id, template_url, server_number): +        boundaries = (1, 30) +        for nr in server_numbers(server_number, boundaries): +            url = template_url % nr              try:                  # We only want to know if the request succeed                  # don't download the whole file                  self._request_webpage(                      HEADRequest(url), track_id, -                    'Checking URL %d/%d ...' % (i + 1, server_count + 1)) +                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))                  return url              except ExtractorError:                  pass -          return None      def _real_extract(self, url): @@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor):          preview_url = self._search_regex(              r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')          song_url = preview_url.replace('/previews/', '/c/originals/') +        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))          template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) -        final_song_url = self._get_url(track_id, template_url) +        final_song_url = self._get_url(track_id, template_url, server_number)          if final_song_url is None:              self.to_screen('Trying with m4a extension')              template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') -            final_song_url = self._get_url(track_id, template_url) +            final_song_url = self._get_url(track_id, template_url, server_number)          if final_song_url is None:              raise ExtractorError('Unable to extract track url')          PREFIX = ( -            r'<span class="play-button[^"]*?"' +            r'm-play-on-spacebar[^>]+'              r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')          title = self._html_search_regex(              PREFIX + r'm-title="([^"]+)"', webpage, 'title') @@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor):              r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)          description = self._og_search_description(webpage)          like_count = str_to_int(self._search_regex( -            [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"', -             r'/favorites/?">([0-9]+)<'], +            r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',              webpage, 'like count', fatal=False))          view_count = str_to_int(self._search_regex(              [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',               r'/listeners/?">([0-9,.]+)</a>'],              webpage, 'play count', fatal=False)) -        timestamp = parse_iso8601(self._search_regex( -            r'<time itemprop="dateCreated" datetime="([^"]+)">', -            webpage, 'upload date', default=None))          return {              'id': track_id, @@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor):              'thumbnail': thumbnail,              'uploader': uploader,              'uploader_id': uploader_id, -            'timestamp': timestamp,              'view_count': view_count,              'like_count': like_count,          } + + +def server_numbers(first, boundaries): +    """ Server numbers to try in descending order of probable availability. +    Starting from first (i.e. the number of the server hosting the preview file) +    and going further and further up to the higher boundary and down to the +    lower one in an alternating fashion. Namely: + +        server_numbers(2, (1, 5)) + +        # Where the preview server is 2, min number is 1 and max is 5. +        # Yields: 2, 3, 1, 4, 5 + +    Why not random numbers or increasing sequences? Since from what I've seen, +    full length files seem to be hosted on servers whose number is closer to +    that of the preview; to be confirmed. +    """ +    zip_longest = getattr(itertools, 'zip_longest', None) +    if zip_longest is None: +        # python 2.x +        zip_longest = itertools.izip_longest + +    if len(boundaries) != 2: +        raise ValueError("boundaries should be a two-element tuple") +    min, max = boundaries +    highs = range(first + 1, max + 1) +    lows = range(first - 1, min - 1, -1) +    rest = filter( +        None, itertools.chain.from_iterable(zip_longest(highs, lows))) +    yield first +    for n in rest: +        yield n diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 7fb4e57df..ddec7b338 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -22,7 +22,7 @@ class NiconicoIE(InfoExtractor):      IE_NAME = 'niconico'      IE_DESC = 'ニコニコ動画' -    _TEST = { +    _TESTS = [{          'url': 'http://www.nicovideo.jp/watch/sm22312215',          'md5': 'd1a75c0823e2f629128c43e1212760f9',          'info_dict': { @@ -39,7 +39,24 @@ class NiconicoIE(InfoExtractor):              'username': 'ydl.niconico@gmail.com',              'password': 'youtube-dl',          }, -    } +    }, { +        'url': 'http://www.nicovideo.jp/watch/nm14296458', +        'md5': '8db08e0158457cf852a31519fceea5bc', +        'info_dict': { +            'id': 'nm14296458', +            'ext': 'swf', +            'title': '【鏡音リン】Dance on media【オリジナル】take2!', +            'description': 'md5:', +            'uploader': 'りょうた', +            'uploader_id': '18822557', +            'upload_date': '20110429', +            'duration': 209, +        }, +        'params': { +            'username': 'ydl.niconico@gmail.com', +            'password': 'youtube-dl', +        }, +    }]      _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico' @@ -89,7 +106,7 @@ class NiconicoIE(InfoExtractor):          if self._AUTHENTICATED:              # Get flv info              flv_info_webpage = self._download_webpage( -                'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, +                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',                  video_id, 'Downloading flv info')          else:              # Get external player info diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index bff36f9d3..e91d3a248 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,46 +14,48 @@ from ..utils import (  class NRKIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})' +    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'      _TESTS = [          { -            'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/', -            'md5': 'a6eac35052f3b242bb6bb7f43aed5886', +            'url': 'http://www.nrk.no/video/PS*150533', +            'md5': 'bccd850baebefe23b56d708a113229c2',              'info_dict': {                  'id': '150533',                  'ext': 'flv',                  'title': 'Dompap og andre fugler i Piip-Show', -                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f' +                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', +                'duration': 263,              }          },          { -            'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/', -            'md5': '3471f2a51718195164e88f46bf427668', +            'url': 'http://www.nrk.no/video/PS*154915', +            'md5': '0b1493ba1aae7d9579a5ad5531bc395a',              'info_dict': {                  'id': '154915',                  'ext': 'flv',                  'title': 'Slik høres internett ut når du er blind',                  'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', +                'duration': 20,              }          },      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - -        page = self._download_webpage(url, video_id) - -        video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id') +        video_id = self._match_id(url)          data = self._download_json( -            'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') +            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, +            video_id, 'Downloading media JSON')          if data['usageRights']['isGeoBlocked']: -            raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True) +            raise ExtractorError( +                'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', +                expected=True) + +        video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' -        video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' +        duration = parse_duration(data.get('duration'))          images = data.get('images')          if images: @@ -69,10 +71,51 @@ class NRKIE(InfoExtractor):              'ext': 'flv',              'title': data['title'],              'description': data['description'], +            'duration': duration,              'thumbnail': thumbnail,          } +class NRKPlaylistIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + +    _TESTS = [{ +        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', +        'info_dict': { +            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', +            'title': 'Gjenopplev den historiske solformørkelsen', +            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', +        }, +        'playlist_count': 2, +    }, { +        'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', +        'info_dict': { +            'id': 'rivertonprisen-til-karin-fossum-1.12266449', +            'title': 'Rivertonprisen til Karin Fossum', +            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', +        }, +        'playlist_count': 5, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        entries = [ +            self.url_result('nrk:%s' % video_id, 'NRK') +            for video_id in re.findall( +                r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"', +                webpage) +        ] + +        playlist_title = self._og_search_title(webpage) +        playlist_description = self._og_search_description(webpage) + +        return self.playlist_result( +            entries, playlist_id, playlist_title, playlist_description) + +  class NRKTVIE(InfoExtractor):      _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 56e1cad3b..03f0a4de6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,15 +1,17 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( +    float_or_none, +    int_or_none, +    parse_iso8601, +)  class NYTimesIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)' +    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',          'md5': '18a525a510f942ada2720db5f31644c0',          'info_dict': { @@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor):              'uploader': 'Brett Weiner',              'duration': 419,          } -    } +    }, { +        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', +        'only_matching': True, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          video_data = self._download_json( -            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON') +            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, +            video_id, 'Downloading video JSON')          title = video_data['headline'] -        description = video_data['summary'] -        duration = video_data['duration'] / 1000.0 +        description = video_data.get('summary') +        duration = float_or_none(video_data.get('duration'), 1000)          uploader = video_data['byline']          timestamp = parse_iso8601(video_data['publication_date'][:-8]) @@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor):          formats = [              {                  'url': video['url'], -                'format_id': video['type'], -                'vcodec': video['video_codec'], -                'width': video['width'], -                'height': video['height'], -                'filesize': get_file_size(video['fileSize']), +                'format_id': video.get('type'), +                'vcodec': video.get('video_codec'), +                'width': int_or_none(video.get('width')), +                'height': int_or_none(video.get('height')), +                'filesize': get_file_size(video.get('fileSize')),              } for video in video_data['renditions']          ]          self._sort_formats(formats) @@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor):          thumbnails = [              {                  'url': 'http://www.nytimes.com/%s' % image['url'], -                'resolution': '%dx%d' % (image['width'], image['height']), +                'width': int_or_none(image.get('width')), +                'height': int_or_none(image.get('height')),              } for image in video_data['images']          ] diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py new file mode 100644 index 000000000..01cc3d9ea --- /dev/null +++ b/youtube_dl/extractor/primesharetv.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import ExtractorError + + +class PrimeShareTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' + +    _TEST = { +        'url': 'http://primeshare.tv/download/238790B611', +        'md5': 'b92d9bf5461137c36228009f31533fbc', +        'info_dict': { +            'id': '238790B611', +            'ext': 'mp4', +            'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        if '>File not exist<' in webpage: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) + +        fields = dict(re.findall(r'''(?x)<input\s+ +            type="hidden"\s+ +            name="([^"]+)"\s+ +            (?:id="[^"]+"\s+)? +            value="([^"]*)" +            ''', webpage)) + +        headers = { +            'Referer': url, +            'Content-Type': 'application/x-www-form-urlencoded', +        } + +        wait_time = int(self._search_regex( +            r'var\s+cWaitTime\s*=\s*(\d+)', +            webpage, 'wait time', default=7)) + 1 +        self._sleep(wait_time, video_id) + +        req = compat_urllib_request.Request( +            url, compat_urllib_parse.urlencode(fields), headers) +        video_page = self._download_webpage( +            req, video_id, 'Downloading video page') + +        video_url = self._search_regex( +            r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", +            video_page, 'video url') + +        title = self._html_search_regex( +            r'<h1>Watch\s*(?: )?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?: )?\s*<strong>', +            video_page, 'title') + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'ext': 'mp4', +        } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index c04791997..11edf616a 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -4,22 +4,87 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .common import compat_str +from ..compat import ( +    compat_str, +    compat_urllib_request +) +from ..utils import sanitize_url_path_consecutive_slashes  class SohuIE(InfoExtractor):      _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' -    _TEST = { +    _TESTS = [{ +        'note': 'This video is available only in Mainland China',          'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', -        'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', +        'md5': '29175c8cadd8b5cc4055001e85d6b372',          'info_dict': {              'id': '382479172',              'ext': 'mp4',              'title': 'MV:Far East Movement《The Illest》',          }, -        'skip': 'Only available from China', -    } +        'params': { +            'cn_verification_proxy': 'proxy.uku.im:8888' +        } +    }, { +        'url': 'http://tv.sohu.com/20150305/n409385080.shtml', +        'md5': '699060e75cf58858dd47fb9c03c42cfb', +        'info_dict': { +            'id': '409385080', +            'ext': 'mp4', +            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', +        } +    }, { +        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', +        'md5': '9bf34be48f2f4dadcb226c74127e203c', +        'info_dict': { +            'id': '78693464', +            'ext': 'mp4', +            'title': '【爱范品】第31期:MWC见不到的奇葩手机', +        } +    }, { +        'note': 'Multipart video', +        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', +        'info_dict': { +            'id': '78910339', +        }, +        'playlist': [{ +            'md5': 'bdbfb8f39924725e6589c146bc1883ad', +            'info_dict': { +                'id': '78910339_part1', +                'ext': 'mp4', +                'duration': 294, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }, { +            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', +            'info_dict': { +                'id': '78910339_part2', +                'ext': 'mp4', +                'duration': 300, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }, { +            'md5': '8407e634175fdac706766481b9443450', +            'info_dict': { +                'id': '78910339_part3', +                'ext': 'mp4', +                'duration': 150, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }] +    }, { +        'note': 'Video with title containing dash', +        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', +        'info_dict': { +            'id': '78932792', +            'ext': 'mp4', +            'title': 'youtube-dl testing video', +        }, +        'params': { +            'skip_download': True +        } +    }]      def _real_extract(self, url): @@ -29,8 +94,14 @@ class SohuIE(InfoExtractor):              else:                  base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' +            req = compat_urllib_request.Request(base_data_url + vid_id) + +            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') +            if cn_verification_proxy: +                req.add_header('Ytdl-request-proxy', cn_verification_proxy) +              return self._download_json( -                base_data_url + vid_id, video_id, +                req, video_id,                  'Downloading JSON data for %s' % vid_id)          mobj = re.match(self._VALID_URL, url) @@ -38,10 +109,8 @@ class SohuIE(InfoExtractor):          mytv = mobj.group('mytv') is not None          webpage = self._download_webpage(url, video_id) -        raw_title = self._html_search_regex( -            r'(?s)<title>(.+?)</title>', -            webpage, 'video title') -        title = raw_title.partition('-')[0].strip() + +        title = self._og_search_title(webpage)          vid = self._html_search_regex(              r'var vid ?= ?["\'](\d+)["\']', @@ -77,7 +146,9 @@ class SohuIE(InfoExtractor):                      % (format_id, i + 1, part_count))                  part_info = part_str.split('|') -                video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + +                video_url = sanitize_url_path_consecutive_slashes( +                    '%s%s?key=%s' % (part_info[0], su[i], part_info[3]))                  formats.append({                      'url': video_url, diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index cbdaf9c7a..aad2bf222 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -23,6 +23,7 @@ class TwitchBaseIE(InfoExtractor):      _API_BASE = 'https://api.twitch.tv'      _USHER_BASE = 'http://usher.twitch.tv'      _LOGIN_URL = 'https://secure.twitch.tv/user/login' +    _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login'      _NETRC_MACHINE = 'twitch'      def _handle_error(self, response): @@ -67,14 +68,14 @@ class TwitchBaseIE(InfoExtractor):              'authenticity_token': authenticity_token,              'redirect_on_login': '',              'embed_form': 'false', -            'mp_source_action': '', +            'mp_source_action': 'login-button',              'follow': '', -            'user[login]': username, -            'user[password]': password, +            'login': username, +            'password': password,          }          request = compat_urllib_request.Request( -            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) +            self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))          request.add_header('Referer', self._LOGIN_URL)          response = self._download_webpage(              request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py new file mode 100644 index 000000000..06554a1be --- /dev/null +++ b/youtube_dl/extractor/ultimedia.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    qualities, +    unified_strdate, +    clean_html, +) + + +class UltimediaIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' +    _TESTS = [{ +        # news +        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', +        'md5': '276a0e49de58c7e85d32b057837952a2', +        'info_dict': { +            'id': 's8uk0r', +            'ext': 'mp4', +            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', +            'description': 'md5:3e5c8fd65791487333dda5db8aed32af', +            'thumbnail': 're:^https?://.*\.jpg', +            'upload_date': '20150317', +        }, +    }, { +        # music +        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', +        'md5': '2ea3513813cf230605c7e2ffe7eca61c', +        'info_dict': { +            'id': 'xvpfp8', +            'ext': 'mp4', +            'title': "Two - C'est la vie (Clip)", +            'description': 'Two', +            'thumbnail': 're:^https?://.*\.jpg', +            'upload_date': '20150224', +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        deliver_url = self._search_regex( +            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', +            webpage, 'deliver URL') + +        deliver_page = self._download_webpage( +            deliver_url, video_id, 'Downloading iframe page') + +        if '>This video is currently not available' in deliver_page: +            raise ExtractorError( +                'Video %s is currently not available' % video_id, expected=True) + +        player = self._parse_json( +            self._search_regex( +                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), +            video_id) + +        quality = qualities(['flash', 'html5']) +        formats = [] +        for mode in player['modes']: +            video_url = mode.get('config', {}).get('file') +            if not video_url: +                continue +            if re.match(r'https?://www\.youtube\.com/.+?', video_url): +                return self.url_result(video_url, 'Youtube') +            formats.append({ +                'url': video_url, +                'format_id': mode.get('type'), +                'quality': quality(mode.get('type')), +            }) +        self._sort_formats(formats) + +        thumbnail = player.get('image') + +        title = clean_html(( +            self._html_search_regex( +                r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', +                webpage, 'title', default=None) +            or self._search_regex( +                r"var\s+nameVideo\s*=\s*'([^']+)'", +                deliver_page, 'title'))) + +        description = clean_html(self._html_search_regex( +            r'(?s)<span>Description</span>(.+?)</p>', webpage, +            'description', fatal=False)) + +        upload_date = unified_strdate(self._search_regex( +            r'Ajouté le\s*<span>([^<]+)', webpage, +            'upload date', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 273030316..eb309a7cd 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -) -from ..utils import ( -    ExtractorError, -    remove_start, -) +from ..compat import compat_urllib_request  class VideoMegaIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://          (?:www\.)?videomega\.tv/ -        (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+) +        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)          '''      _TEST = { -        'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ', +        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',          'md5': 'bf5c2f95c4c917536e80936af7bc51e1',          'info_dict': { -            'id': 'QR0HCUHI1661IHUCH0RQ', +            'id': '4GNA688SU99US886ANG4',              'ext': 'mp4', -            'title': 'Big Buck Bunny', +            'title': 'BigBuckBunny_320x180',              'thumbnail': 're:^https?://.*\.jpg$',          }      } @@ -33,34 +26,24 @@ class VideoMegaIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id) +        iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id          req = compat_urllib_request.Request(iframe_url)          req.add_header('Referer', url)          webpage = self._download_webpage(req, video_id) -        try: -            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1] -        except IndexError: -            raise ExtractorError('Unable to extract escaped data') - -        playlist = compat_urllib_parse.unquote(escaped_data) - +        title = self._html_search_regex( +            r'<title>(.*?)</title>', webpage, 'title') +        title = re.sub( +            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)          thumbnail = self._search_regex( -            r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) -        video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') -        title = remove_start(self._html_search_regex( -            r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ') - -        formats = [{ -            'format_id': 'sd', -            'url': video_url, -        }] -        self._sort_formats(formats) +            r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) +        video_url = self._search_regex( +            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')          return {              'id': video_id,              'title': title, -            'formats': formats, +            'url': video_url,              'thumbnail': thumbnail,              'http_headers': {                  'Referer': iframe_url, diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0b58fe0fe..c3187cfeb 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -33,14 +33,13 @@ class VineIE(InfoExtractor):              r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))          formats = [{ -            'url': data['videoLowURL'], -            'ext': 'mp4', -            'format_id': 'low', -        }, { -            'url': data['videoUrl'], -            'ext': 'mp4', -            'format_id': 'standard', -        }] +            'format_id': '%(format)s-%(rate)s' % f, +            'vcodec': f['format'], +            'quality': f['rate'], +            'url': f['videoUrl'], +        } for f in data['videoUrls'] if f.get('rate')] + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c3135effc..472d4df41 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -326,6 +326,13 @@ def sanitize_path(s):      return os.path.join(*sanitized_path) +def sanitize_url_path_consecutive_slashes(url): +    """Collapses consecutive slashes in URLs' path""" +    parsed_url = list(compat_urlparse.urlparse(url)) +    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) +    return compat_urlparse.urlunparse(parsed_url) + +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7ed07c375..51b4260aa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.03.15' +__version__ = '2015.03.18' | 
