diff options
47 files changed, 2122 insertions, 548 deletions
| diff --git a/.gitignore b/.gitignore index 61cb6bc3c..24fdb3626 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ updates_key.pem  *.flv  *.mp4  *.part +test/testdata @@ -19,7 +19,8 @@ which means you can modify it, redistribute it or use it however you like.      -U, --update               update this program to latest version. Make sure                                 that you have sufficient permissions (run with                                 sudo if needed) -    -i, --ignore-errors        continue on download errors +    -i, --ignore-errors        continue on download errors, for example to to +                               skip unavailable videos in a playlist      --dump-user-agent          display the current browser identification      --user-agent UA            specify a custom user agent      --referer REF              specify a custom referer, use if the video access @@ -29,6 +30,10 @@ which means you can modify it, redistribute it or use it however you like.      --extractor-descriptions   Output descriptions of all supported extractors      --proxy URL                Use the specified HTTP/HTTPS proxy      --no-check-certificate     Suppress HTTPS certificate validation. +    --cache-dir None           Location in the filesystem where youtube-dl can +                               store downloaded information permanently. 
+                               ~/.youtube-dl/cache by default +    --no-cache-dir             Disable filesystem caching  ## Video Selection:      --playlist-start NUMBER    playlist video to start at (default is 1) diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py new file mode 100644 index 000000000..e0c3cc83e --- /dev/null +++ b/devscripts/buildserver.py @@ -0,0 +1,405 @@ +#!/usr/bin/python3 + +from http.server import HTTPServer, BaseHTTPRequestHandler +from socketserver import ThreadingMixIn +import argparse +import ctypes +import functools +import sys +import threading +import traceback +import os.path + + +class BuildHTTPServer(ThreadingMixIn, HTTPServer): +    allow_reuse_address = True + + +advapi32 = ctypes.windll.advapi32 + +SC_MANAGER_ALL_ACCESS = 0xf003f +SC_MANAGER_CREATE_SERVICE = 0x02 +SERVICE_WIN32_OWN_PROCESS = 0x10 +SERVICE_AUTO_START = 0x2 +SERVICE_ERROR_NORMAL = 0x1 +DELETE = 0x00010000 +SERVICE_STATUS_START_PENDING = 0x00000002 +SERVICE_STATUS_RUNNING = 0x00000004 +SERVICE_ACCEPT_STOP = 0x1 + +SVCNAME = 'youtubedl_builder' + +LPTSTR = ctypes.c_wchar_p +START_CALLBACK = ctypes.WINFUNCTYPE(None, ctypes.c_int, ctypes.POINTER(LPTSTR)) + + +class SERVICE_TABLE_ENTRY(ctypes.Structure): +    _fields_ = [ +        ('lpServiceName', LPTSTR), +        ('lpServiceProc', START_CALLBACK) +    ] + + +HandlerEx = ctypes.WINFUNCTYPE( +    ctypes.c_int,     # return +    ctypes.c_int,     # dwControl +    ctypes.c_int,     # dwEventType +    ctypes.c_void_p,  # lpEventData, +    ctypes.c_void_p,  # lpContext, +) + + +def _ctypes_array(c_type, py_array): +    ar = (c_type * len(py_array))() +    ar[:] = py_array +    return ar + + +def win_OpenSCManager(): +    res = advapi32.OpenSCManagerW(None, None, SC_MANAGER_ALL_ACCESS) +    if not res: +        raise Exception('Opening service manager failed - ' +                        'are you running this as administrator?') +    return res + + +def win_install_service(service_name, cmdline): +    
manager = win_OpenSCManager() +    try: +        h = advapi32.CreateServiceW( +            manager, service_name, None, +            SC_MANAGER_CREATE_SERVICE, SERVICE_WIN32_OWN_PROCESS, +            SERVICE_AUTO_START, SERVICE_ERROR_NORMAL, +            cmdline, None, None, None, None, None) +        if not h: +            raise OSError('Service creation failed: %s' % ctypes.FormatError()) + +        advapi32.CloseServiceHandle(h) +    finally: +        advapi32.CloseServiceHandle(manager) + + +def win_uninstall_service(service_name): +    manager = win_OpenSCManager() +    try: +        h = advapi32.OpenServiceW(manager, service_name, DELETE) +        if not h: +            raise OSError('Could not find service %s: %s' % ( +                service_name, ctypes.FormatError())) + +        try: +            if not advapi32.DeleteService(h): +                raise OSError('Deletion failed: %s' % ctypes.FormatError()) +        finally: +            advapi32.CloseServiceHandle(h) +    finally: +        advapi32.CloseServiceHandle(manager) + + +def win_service_report_event(service_name, msg, is_error=True): +    with open('C:/sshkeys/log', 'a', encoding='utf-8') as f: +        f.write(msg + '\n') + +    event_log = advapi32.RegisterEventSourceW(None, service_name) +    if not event_log: +        raise OSError('Could not report event: %s' % ctypes.FormatError()) + +    try: +        type_id = 0x0001 if is_error else 0x0004 +        event_id = 0xc0000000 if is_error else 0x40000000 +        lines = _ctypes_array(LPTSTR, [msg]) + +        if not advapi32.ReportEventW( +                event_log, type_id, 0, event_id, None, len(lines), 0, +                lines, None): +            raise OSError('Event reporting failed: %s' % ctypes.FormatError()) +    finally: +        advapi32.DeregisterEventSource(event_log) + + +def win_service_handler(stop_event, *args): +    try: +        raise ValueError('Handler called with args ' + repr(args)) +        TODO +    except Exception as 
e: +        tb = traceback.format_exc() +        msg = str(e) + '\n' + tb +        win_service_report_event(service_name, msg, is_error=True) +        raise + + +def win_service_set_status(handle, status_code): +    svcStatus = SERVICE_STATUS() +    svcStatus.dwServiceType = SERVICE_WIN32_OWN_PROCESS +    svcStatus.dwCurrentState = status_code +    svcStatus.dwControlsAccepted = SERVICE_ACCEPT_STOP + +    svcStatus.dwServiceSpecificExitCode = 0 + +    if not advapi32.SetServiceStatus(handle, ctypes.byref(svcStatus)): +        raise OSError('SetServiceStatus failed: %r' % ctypes.FormatError()) + + +def win_service_main(service_name, real_main, argc, argv_raw): +    try: +        #args = [argv_raw[i].value for i in range(argc)] +        stop_event = threading.Event() +        handler = HandlerEx(functools.partial(stop_event, win_service_handler)) +        h = advapi32.RegisterServiceCtrlHandlerExW(service_name, handler, None) +        if not h: +            raise OSError('Handler registration failed: %s' % +                          ctypes.FormatError()) + +        TODO +    except Exception as e: +        tb = traceback.format_exc() +        msg = str(e) + '\n' + tb +        win_service_report_event(service_name, msg, is_error=True) +        raise + + +def win_service_start(service_name, real_main): +    try: +        cb = START_CALLBACK( +            functools.partial(win_service_main, service_name, real_main)) +        dispatch_table = _ctypes_array(SERVICE_TABLE_ENTRY, [ +            SERVICE_TABLE_ENTRY( +                service_name, +                cb +            ), +            SERVICE_TABLE_ENTRY(None, ctypes.cast(None, START_CALLBACK)) +        ]) + +        if not advapi32.StartServiceCtrlDispatcherW(dispatch_table): +            raise OSError('ctypes start failed: %s' % ctypes.FormatError()) +    except Exception as e: +        tb = traceback.format_exc() +        msg = str(e) + '\n' + tb +        win_service_report_event(service_name, msg, 
is_error=True) +        raise + + +def main(args=None): +    parser = argparse.ArgumentParser() +    parser.add_argument('-i', '--install', +                        action='store_const', dest='action', const='install', +                        help='Launch at Windows startup') +    parser.add_argument('-u', '--uninstall', +                        action='store_const', dest='action', const='uninstall', +                        help='Remove Windows service') +    parser.add_argument('-s', '--service', +                        action='store_const', dest='action', const='service', +                        help='Run as a Windows service') +    parser.add_argument('-b', '--bind', metavar='<host:port>', +                        action='store', default='localhost:8142', +                        help='Bind to host:port (default %default)') +    options = parser.parse_args(args=args) + +    if options.action == 'install': +        fn = os.path.abspath(__file__).replace('v:', '\\\\vboxsrv\\vbox') +        cmdline = '%s %s -s -b %s' % (sys.executable, fn, options.bind) +        win_install_service(SVCNAME, cmdline) +        return + +    if options.action == 'uninstall': +        win_uninstall_service(SVCNAME) +        return + +    if options.action == 'service': +        win_service_start(SVCNAME, main) +        return + +    host, port_str = options.bind.split(':') +    port = int(port_str) + +    print('Listening on %s:%d' % (host, port)) +    srv = BuildHTTPServer((host, port), BuildHTTPRequestHandler) +    thr = threading.Thread(target=srv.serve_forever) +    thr.start() +    input('Press ENTER to shut down') +    srv.shutdown() +    thr.join() + + +def rmtree(path): +    for name in os.listdir(path): +        fname = os.path.join(path, name) +        if os.path.isdir(fname): +            rmtree(fname) +        else: +            os.chmod(fname, 0o666) +            os.remove(fname) +    os.rmdir(path) + 
+#============================================================================== + +class BuildError(Exception): +    def __init__(self, output, code=500): +        self.output = output +        self.code = code + +    def __str__(self): +        return self.output + + +class HTTPError(BuildError): +    pass + + +class PythonBuilder(object): +    def __init__(self, **kwargs): +        pythonVersion = kwargs.pop('python', '2.7') +        try: +            key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\Python\PythonCore\%s\InstallPath' % pythonVersion) +            try: +                self.pythonPath, _ = _winreg.QueryValueEx(key, '') +            finally: +                _winreg.CloseKey(key) +        except Exception: +            raise BuildError('No such Python version: %s' % pythonVersion) + +        super(PythonBuilder, self).__init__(**kwargs) + + +class GITInfoBuilder(object): +    def __init__(self, **kwargs): +        try: +            self.user, self.repoName = kwargs['path'][:2] +            self.rev = kwargs.pop('rev') +        except ValueError: +            raise BuildError('Invalid path') +        except KeyError as e: +            raise BuildError('Missing mandatory parameter "%s"' % e.args[0]) + +        path = os.path.join(os.environ['APPDATA'], 'Build archive', self.repoName, self.user) +        if not os.path.exists(path): +            os.makedirs(path) +        self.basePath = tempfile.mkdtemp(dir=path) +        self.buildPath = os.path.join(self.basePath, 'build') + +        super(GITInfoBuilder, self).__init__(**kwargs) + + +class GITBuilder(GITInfoBuilder): +    def build(self): +        try: +            subprocess.check_output(['git', 'clone', 'git://github.com/%s/%s.git' % (self.user, self.repoName), self.buildPath]) +            subprocess.check_output(['git', 'checkout', self.rev], cwd=self.buildPath) +        except subprocess.CalledProcessError as e: +            raise BuildError(e.output) + +        super(GITBuilder, 
self).build() + + +class YoutubeDLBuilder(object): +    authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile'] + +    def __init__(self, **kwargs): +        if self.repoName != 'youtube-dl': +            raise BuildError('Invalid repository "%s"' % self.repoName) +        if self.user not in self.authorizedUsers: +            raise HTTPError('Unauthorized user "%s"' % self.user, 401) + +        super(YoutubeDLBuilder, self).__init__(**kwargs) + +    def build(self): +        try: +            subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'], +                                    cwd=self.buildPath) +        except subprocess.CalledProcessError as e: +            raise BuildError(e.output) + +        super(YoutubeDLBuilder, self).build() + + +class DownloadBuilder(object): +    def __init__(self, **kwargs): +        self.handler = kwargs.pop('handler') +        self.srcPath = os.path.join(self.buildPath, *tuple(kwargs['path'][2:])) +        self.srcPath = os.path.abspath(os.path.normpath(self.srcPath)) +        if not self.srcPath.startswith(self.buildPath): +            raise HTTPError(self.srcPath, 401) + +        super(DownloadBuilder, self).__init__(**kwargs) + +    def build(self): +        if not os.path.exists(self.srcPath): +            raise HTTPError('No such file', 404) +        if os.path.isdir(self.srcPath): +            raise HTTPError('Is a directory: %s' % self.srcPath, 401) + +        self.handler.send_response(200) +        self.handler.send_header('Content-Type', 'application/octet-stream') +        self.handler.send_header('Content-Disposition', 'attachment; filename=%s' % os.path.split(self.srcPath)[-1]) +        self.handler.send_header('Content-Length', str(os.stat(self.srcPath).st_size)) +        self.handler.end_headers() + +        with open(self.srcPath, 'rb') as src: +            shutil.copyfileobj(src, self.handler.wfile) + +        super(DownloadBuilder, self).build() + + +class 
CleanupTempDir(object): +    def build(self): +        try: +            rmtree(self.basePath) +        except Exception as e: +            print('WARNING deleting "%s": %s' % (self.basePath, e)) + +        super(CleanupTempDir, self).build() + + +class Null(object): +    def __init__(self, **kwargs): +        pass + +    def start(self): +        pass + +    def close(self): +        pass + +    def build(self): +        pass + + +class Builder(PythonBuilder, GITBuilder, YoutubeDLBuilder, DownloadBuilder, CleanupTempDir, Null): +    pass + + +class BuildHTTPRequestHandler(BaseHTTPRequestHandler): +    actionDict = { 'build': Builder, 'download': Builder } # They're the same, no more caching. + +    def do_GET(self): +        path = urlparse.urlparse(self.path) +        paramDict = dict([(key, value[0]) for key, value in urlparse.parse_qs(path.query).items()]) +        action, _, path = path.path.strip('/').partition('/') +        if path: +            path = path.split('/') +            if action in self.actionDict: +                try: +                    builder = self.actionDict[action](path=path, handler=self, **paramDict) +                    builder.start() +                    try: +                        builder.build() +                    finally: +                        builder.close() +                except BuildError as e: +                    self.send_response(e.code) +                    msg = unicode(e).encode('UTF-8') +                    self.send_header('Content-Type', 'text/plain; charset=UTF-8') +                    self.send_header('Content-Length', len(msg)) +                    self.end_headers() +                    self.wfile.write(msg) +                except HTTPError as e: +                    self.send_response(e.code, str(e)) +            else: +                self.send_response(500, 'Unknown build method "%s"' % action) +        else: +            self.send_response(500, 'Malformed URL') + 
+#============================================================================== + +if __name__ == '__main__': +    main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 62c68a6cf..796468b4b 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -55,8 +55,8 @@ git push origin "$version"  /bin/echo -e "\n### OK, now it is time to build the binaries..."  REV=$(git rev-parse HEAD)  make youtube-dl youtube-dl.tar.gz -wget "http://jeromelaheurte.net:8142/download/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe || \ -	wget "http://jeromelaheurte.net:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe +read -p "VM running? (y/n) " -n 1 +wget "http://localhost:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe  mkdir -p "build/$version"  mv youtube-dl youtube-dl.exe "build/$version"  mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz" diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py deleted file mode 100644 index b390c7e2e..000000000 --- a/devscripts/youtube_genalgo.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python - -# Generate youtube signature algorithm from test cases - -import sys - -tests = [ -    # 92 - vflQw-fB4 2013/07/17 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", -     "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), -    # 90 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", -     "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), -    # 89  -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", -     "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), -    # 88 - vflapUV9V 2013/08/28 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", -     
"ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), -    # 87 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", -     "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), -    # 86 - vfluy6kdb 2013/09/06 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", -     "yuioplkjhgfdsazxcvbnm12345678q0QWrRTYUIOELKJHGFD-AZXCVBNM!@#$%^&*()_<+={[|};?/>.S"), -    # 85 - vflkuzxcs 2013/09/11 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", -     "T>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOvUY.REWQ0987654321mnbqcxzasdfghjklpoiuytr"), -    # 84 - vflg0g8PQ 2013/08/29 (sporadic) -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", -     ">?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWq0987654321mnbvcxzasdfghjklpoiuytr"), -    # 83 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", -     ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), -    # 82 - vflGNjMhJ 2013/09/12 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", -     ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"), -    # 81 - vflLC8JvQ 2013/07/25 -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", -     "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), -    # 80 - vflZK4ZYR 2013/08/23 (sporadic) -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>", -     "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>"), -    # 79 - vflLC8JvQ 2013/07/25 (sporadic) -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", -     
"Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), -] - -tests_age_gate = [ -    # 86 - vflqinMWD -    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", -     "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), -] - -def find_matching(wrong, right): -    idxs = [wrong.index(c) for c in right] -    return compress(idxs) -    return ('s[%d]' % i for i in idxs) - -def compress(idxs): -    def _genslice(start, end, step): -        starts = '' if start == 0 else str(start) -        ends = ':%d' % (end+step) -        steps = '' if step == 1 else (':%d' % step) -        return 's[%s%s%s]' % (starts, ends, steps) - -    step = None -    for i, prev in zip(idxs[1:], idxs[:-1]): -        if step is not None: -            if i - prev == step: -                continue -            yield _genslice(start, prev, step) -            step = None -            continue -        if i - prev in [-1, 1]: -            step = i - prev -            start = prev -            continue -        else: -            yield 's[%d]' % prev -    if step is None: -        yield 's[%d]' % i -    else: -        yield _genslice(start, i, step) - -def _assert_compress(inp, exp): -    res = list(compress(inp)) -    if res != exp: -        print('Got %r, expected %r' % (res, exp)) -        assert res == exp -_assert_compress([0,2,4,6], ['s[0]', 's[2]', 's[4]', 's[6]']) -_assert_compress([0,1,2,4,6,7], ['s[:3]', 's[4]', 's[6:8]']) -_assert_compress([8,0,1,2,4,7,6,9], ['s[8]', 's[:3]', 's[4]', 's[7:5:-1]', 's[9]']) - -def gen(wrong, right, indent): -    code = ' + '.join(find_matching(wrong, right)) -    return 'if len(s) == %d:\n%s    return %s\n' % (len(wrong), indent, code) - -def genall(tests): -    indent = ' ' * 8 -    return indent + (indent + 'el').join(gen(wrong, right, indent) for wrong,right in tests) - -def main(): -    print(genall(tests)) -    print(u'    Age gate:') -    
print(genall(tests_age_gate)) - -if __name__ == '__main__': -    main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 99fc7bd28..ff1c86efe 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668          self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])          self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) +        self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])      def test_youtube_channel_matching(self):          assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index bcd9f79f6..83c65d57e 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')      def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 5) @@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          self.assertTrue(len(subtitles.keys()) == 0)      def test_nosubtitles(self):          self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/test/test_playlists.py b/test/test_playlists.py index 4a2e00b01..c33511333 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -1,4 
+1,5 @@  #!/usr/bin/env python +# encoding: utf-8  import sys  import unittest @@ -8,7 +9,14 @@ import json  import os  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE +from youtube_dl.extractor import ( +    DailymotionPlaylistIE, +    DailymotionUserIE, +    VimeoChannelIE, +    UstreamChannelIE, +    SoundcloudUserIE, +    LivestreamIE, +)  from youtube_dl.utils import *  from helper import FakeYDL @@ -26,6 +34,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], u'SPORT')          self.assertTrue(len(result['entries']) > 20) +    def test_dailymotion_user(self): +        dl = FakeYDL() +        ie = DailymotionUserIE(dl) +        result = ie.extract('http://www.dailymotion.com/user/generation-quoi/') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'Génération Quoi') +        self.assertTrue(len(result['entries']) >= 26) +      def test_vimeo_channel(self):          dl = FakeYDL()          ie = VimeoChannelIE(dl) @@ -42,5 +58,21 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], u'5124905')          self.assertTrue(len(result['entries']) >= 11) +    def test_soundcloud_user(self): +        dl = FakeYDL() +        ie = SoundcloudUserIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], u'9615865') +        self.assertTrue(len(result['entries']) >= 12) + +    def test_livestream_event(self): +        dl = FakeYDL() +        ie = LivestreamIE(dl) +        result = ie.extract('http://new.livestream.com/tedx/cityenglish') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'TEDCity2.0 (English)') +        self.assertTrue(len(result['entries']) >= 4) +  if __name__ == '__main__':      unittest.main() diff --git 
a/test/test_youtube_signature.py b/test/test_youtube_signature.py new file mode 100644 index 000000000..5007d9a16 --- /dev/null +++ b/test/test_youtube_signature.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +import io +import re +import string +import sys +import unittest + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import YoutubeIE +from youtube_dl.utils import compat_str, compat_urlretrieve + +_TESTS = [ +    ( +        u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', +        u'js', +        86, +        u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', +    ), +    ( +        u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', +        u'js', +        85, +        u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', +    ), +    ( +        u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf', +        u'swf', +        82, +        u':/.-,+*)=\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBAzyxw>utsrqponmlkjihgfedcba987654321' +    ), +] + + +class TestSignature(unittest.TestCase): +    def setUp(self): +        TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +        self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') +        if not os.path.exists(self.TESTDATA_DIR): +            os.mkdir(self.TESTDATA_DIR) + + +def make_tfunc(url, stype, sig_length, expected_sig): +    basename = url.rpartition('/')[2] +    m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) +    assert m, '%r should follow URL format' % basename +    test_id = m.group(1) + +    def test_func(self): +        fn = os.path.join(self.TESTDATA_DIR, basename) + +        if not os.path.exists(fn): +            compat_urlretrieve(url, fn) + +        ie = YoutubeIE() +        if stype == 'js': +            with io.open(fn, encoding='utf-8') as testf: +                jscode = testf.read() +            func = 
ie._parse_sig_js(jscode) +        else: +            assert stype == 'swf' +            with open(fn, 'rb') as testf: +                swfcode = testf.read() +            func = ie._parse_sig_swf(swfcode) +        src_sig = compat_str(string.printable[:sig_length]) +        got_sig = func(src_sig) +        self.assertEqual(got_sig, expected_sig) + +    test_func.__name__ = str('test_signature_' + stype + '_' + test_id) +    setattr(TestSignature, test_func.__name__, test_func) + +for test_spec in _TESTS: +    make_tfunc(*test_spec) + + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 5632871ac..168e6c66c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')      def test_youtube_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) @@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          self.assertTrue(subtitles['it'] is not None)      def test_youtube_nosubtitles(self):          self.url = 'sAjKT8FhjI8' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 0b5a5d77d..d6673fd3a 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -77,26 +77,43 @@ class FileDownloader(object):      @staticmethod      def calc_percent(byte_counter, data_len):          if data_len is None: +            return None +        return float(byte_counter) / float(data_len) * 100.0 + +    
@staticmethod +    def format_percent(percent): +        if percent is None:              return '---.-%' -        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) +        return '%6s' % ('%3.1f%%' % percent)      @staticmethod      def calc_eta(start, now, total, current):          if total is None: -            return '--:--' +            return None          dif = now - start          if current == 0 or dif < 0.001: # One millisecond -            return '--:--' +            return None          rate = float(current) / dif -        eta = int((float(total) - float(current)) / rate) +        return int((float(total) - float(current)) / rate) + +    @staticmethod +    def format_eta(eta): +        if eta is None: +            return '--:--'          return FileDownloader.format_seconds(eta)      @staticmethod      def calc_speed(start, now, bytes):          dif = now - start          if bytes == 0 or dif < 0.001: # One millisecond +            return None +        return float(bytes) / dif + +    @staticmethod +    def format_speed(speed): +        if speed is None:              return '%10s' % '---b/s' -        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) +        return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))      @staticmethod      def best_block_size(elapsed_time, bytes): @@ -205,11 +222,14 @@ class FileDownloader(object):          """Report destination filename."""          self.to_screen(u'[download] Destination: ' + filename) -    def report_progress(self, percent_str, data_len_str, speed_str, eta_str): +    def report_progress(self, percent, data_len_str, speed, eta):          """Report download progress."""          if self.params.get('noprogress', False):              return          clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') +        eta_str = self.format_eta(eta) +        percent_str = self.format_percent(percent) +        speed_str = 
self.format_speed(speed)          if self.params.get('progress_with_newline', False):              self.to_screen(u'[download] %s of %s at %s ETA %s' %                  (percent_str, data_len_str, speed_str, eta_str)) @@ -378,6 +398,7 @@ class FileDownloader(object):              self._hook_progress({                  'filename': filename,                  'status': 'finished', +                'total_bytes': os.path.getsize(encodeFilename(filename)),              })              return True @@ -524,13 +545,14 @@ class FileDownloader(object):                  block_size = self.best_block_size(after - before, len(data_block))              # Progress message -            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) +            speed = self.calc_speed(start, time.time(), byte_counter - resume_len)              if data_len is None:                  self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') +                eta = None              else: -                percent_str = self.calc_percent(byte_counter, data_len) -                eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) -                self.report_progress(percent_str, data_len_str, speed_str, eta_str) +                percent = self.calc_percent(byte_counter, data_len) +                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) +                self.report_progress(percent, data_len_str, speed, eta)              self._hook_progress({                  'downloaded_bytes': byte_counter, @@ -538,6 +560,8 @@ class FileDownloader(object):                  'tmpfilename': tmpfilename,                  'filename': filename,                  'status': 'downloading', +                'eta': eta, +                'speed': speed,              })              # Apply rate limit @@ -580,6 +604,8 @@ class FileDownloader(object):          * downloaded_bytes: Bytes on disks        
  * total_bytes: Total bytes, None if unknown          * tmpfilename: The filename we're currently writing to +        * eta: The estimated time in seconds, None if unknown +        * speed: The download speed in bytes/second, None if unknown          Hooks are guaranteed to be called at least once (with status "finished")          if the download is successful. diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index ae56d2082..3ee1d3c58 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -444,8 +444,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          if information['ext'] != u'mp4':              self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files')              return True, information -        sub_langs = [key for key in information['subtitles']] +        if not information.get('subtitles'): +            self._downloader.to_screen(u'[ffmpeg] There aren\'t any subtitles to embed')  +            return True, information +        sub_langs = [key for key in information['subtitles']]          filename = information['filepath']          input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c2f992b8e..44a272e7e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -74,12 +74,15 @@ class YoutubeDL(object):      writesubtitles:    Write the video subtitles to a file      writeautomaticsub: Write the automatic subtitles to a file      allsubtitles:      Downloads all the subtitles of the video +                       (requires writesubtitles or writeautomaticsub)      listsubtitles:     Lists all available subtitles for the video      subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)      subtitleslangs:    List of languages of the subtitles to download      keepvideo:         Keep the video file after post-processing      daterange:       
  A DateRange object, download only if the upload_date is in the range.      skip_download:     Skip the actual download of the video file +    cachedir:          Location of the cache files in the filesystem. +                       None to disable filesystem cache.      The following parameters are not used by YoutubeDL itself, they are used by      the FileDownloader: @@ -103,6 +106,17 @@ class YoutubeDL(object):          self._download_retcode = 0          self._num_downloads = 0          self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + +        if (sys.version_info >= (3,) and sys.platform != 'win32' and +                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] +                and not params['restrictfilenames']): +            # On Python 3, the Unicode filesystem API will throw errors (#1474) +            self.report_warning( +                u'Assuming --restrict-filenames isnce file system encoding ' +                u'cannot encode all charactes. 
' +                u'Set the LC_ALL environment variable to fix this.') +            params['restrictfilenames'] = True +          self.params = params          self.fd = FileDownloader(self, self.params) @@ -141,14 +155,10 @@ class YoutubeDL(object):      def to_screen(self, message, skip_eol=False):          """Print message to stdout if not in quiet mode.""" -        assert type(message) == type(u'')          if not self.params.get('quiet', False):              terminator = [u'\n', u''][skip_eol]              output = message + terminator -            if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr -                output = output.encode(preferredencoding(), 'ignore') -            self._screen_file.write(output) -            self._screen_file.flush() +            write_string(output, self._screen_file)      def to_stderr(self, message):          """Print message to stderr.""" @@ -499,8 +509,7 @@ class YoutubeDL(object):                  return          subtitles_are_requested = any([self.params.get('writesubtitles', False), -                                       self.params.get('writeautomaticsub'), -                                       self.params.get('allsubtitles', False)]) +                                       self.params.get('writeautomaticsub')])          if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:              # subtitles download errors are already managed as troubles in relevant IE @@ -536,11 +545,15 @@ class YoutubeDL(object):                  thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format                  self.to_screen(u'[%s] %s: Downloading thumbnail ...' 
%                                 (info_dict['extractor'], info_dict['id'])) -                uf = compat_urllib_request.urlopen(info_dict['thumbnail']) -                with open(thumb_filename, 'wb') as thumbf: -                    shutil.copyfileobj(uf, thumbf) -                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % -                               (info_dict['extractor'], info_dict['id'], thumb_filename)) +                try: +                    uf = compat_urllib_request.urlopen(info_dict['thumbnail']) +                    with open(thumb_filename, 'wb') as thumbf: +                        shutil.copyfileobj(uf, thumbf) +                    self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % +                        (info_dict['extractor'], info_dict['id'], thumb_filename)) +                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +                    self.report_warning(u'Unable to download thumbnail "%s": %s' % +                        (info_dict['thumbnail'], compat_str(err)))          if not self.params.get('skip_download', False):              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): @@ -548,11 +561,11 @@ class YoutubeDL(object):              else:                  try:                      success = self.fd._do_download(filename, info_dict) -                except (OSError, IOError) as err: -                    raise UnavailableVideoError(err)                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:                      self.report_error(u'unable to download video data: %s' % str(err))                      return +                except (OSError, IOError) as err: +                    raise UnavailableVideoError(err)                  except (ContentTooShortError, ) as err:                      self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, 
err.downloaded))                      return diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 696e54f49..28a7bdd92 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -30,6 +30,7 @@ __authors__  = (      'Pierre Rudloff',      'Huarong Huo',      'Ismael Mejía', +    'Steffan \'Ruirize\' James',  )  __license__ = 'Public Domain' @@ -149,7 +150,7 @@ def parseOpts(overrideArguments=None):      general.add_option('-U', '--update',              action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')      general.add_option('-i', '--ignore-errors', -            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) +            action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False)      general.add_option('--dump-user-agent',              action='store_true', dest='dump_user_agent',              help='display the current browser identification', default=False) @@ -166,6 +167,12 @@ def parseOpts(overrideArguments=None):              help='Output descriptions of all supported extractors', default=False)      general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')      general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') +    general.add_option( +        '--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', +        help='Location in the filesystem where youtube-dl can store downloaded information permanently. 
%default by default') +    general.add_option( +        '--no-cache-dir', action='store_const', const=None, dest='cachedir', +        help='Disable filesystem caching')      selection.add_option('--playlist-start', @@ -271,6 +278,10 @@ def parseOpts(overrideArguments=None):      verbosity.add_option('--dump-intermediate-pages',              action='store_true', dest='dump_intermediate_pages', default=False,              help='print downloaded pages to debug problems(very verbose)') +    verbosity.add_option('--youtube-print-sig-code', +            action='store_true', dest='youtube_print_sig_code', default=False, +            help=optparse.SUPPRESS_HELP) +      filesystem.add_option('-t', '--title',              action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -354,7 +365,7 @@ def parseOpts(overrideArguments=None):      if overrideArguments is not None:          opts, args = parser.parse_args(overrideArguments)          if opts.verbose: -            sys.stderr.write(u'[debug] Override config: ' + repr(overrideArguments) + '\n') +            write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')      else:          xdg_config_home = os.environ.get('XDG_CONFIG_HOME')          if xdg_config_home: @@ -367,9 +378,9 @@ def parseOpts(overrideArguments=None):          argv = systemConf + userConf + commandLineConf          opts, args = parser.parse_args(argv)          if opts.verbose: -            sys.stderr.write(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') -            sys.stderr.write(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') -            sys.stderr.write(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') +            write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') +            write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') +            
write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')      return parser, opts, args @@ -392,7 +403,7 @@ def _real_main(argv=None):          except (IOError, OSError) as err:              if opts.verbose:                  traceback.print_exc() -            sys.stderr.write(u'ERROR: unable to open cookie file\n') +            write_string(u'ERROR: unable to open cookie file\n')              sys.exit(101)      # Set user agent      if opts.user_agent is not None: @@ -419,7 +430,7 @@ def _real_main(argv=None):              batchurls = [x.strip() for x in batchurls]              batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]              if opts.verbose: -                sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') +                write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')          except IOError:              sys.exit(u'ERROR: batch file could not be read')      all_urls = batchurls + args @@ -533,6 +544,11 @@ def _real_main(argv=None):      else:          date = DateRange(opts.dateafter, opts.datebefore) +    # --all-sub automatically sets --write-sub if --write-auto-sub is not given +    # this was the old behaviour if only --all-sub was given. +    if opts.allsubtitles and (opts.writeautomaticsub == False): +        opts.writesubtitles = True +      if sys.version_info < (3,):          # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)          if opts.outtmpl is not None: @@ -545,6 +561,10 @@ def _real_main(argv=None):              or (opts.useid and u'%(id)s.%(ext)s')              or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')              or u'%(title)s-%(id)s.%(ext)s') +    if '%(ext)s' not in outtmpl and opts.extractaudio: +        parser.error(u'Cannot download a video and extract audio into the same' +                     u' file! 
Use "%%(ext)s" instead of %r' % +                     determine_ext(outtmpl, u''))      # YoutubeDL      ydl = YoutubeDL({ @@ -603,10 +623,12 @@ def _real_main(argv=None):          'min_filesize': opts.min_filesize,          'max_filesize': opts.max_filesize,          'daterange': date, +        'cachedir': opts.cachedir, +        'youtube_print_sig_code': opts.youtube_print_sig_code,          })      if opts.verbose: -        sys.stderr.write(u'[debug] youtube-dl version ' + __version__ + u'\n') +        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')          try:              sp = subprocess.Popen(                  ['git', 'rev-parse', '--short', 'HEAD'], @@ -615,14 +637,14 @@ def _real_main(argv=None):              out, err = sp.communicate()              out = out.decode().strip()              if re.match('[0-9a-f]+', out): -                sys.stderr.write(u'[debug] Git HEAD: ' + out + u'\n') +                write_string(u'[debug] Git HEAD: ' + out + u'\n')          except:              try:                  sys.exc_clear()              except:                  pass -        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') -        sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') +        write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') +        write_string(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n')      ydl.add_default_info_extractors() @@ -636,7 +658,7 @@ def _real_main(argv=None):      # Update version      if opts.update_self: -        update_self(ydl.to_screen, opts.verbose, sys.argv[0]) +        update_self(ydl.to_screen, opts.verbose)      # Maybe do nothing      if len(all_urls) < 1: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f9542d2..d1b7e5f99 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ 
-6,6 +6,7 @@ from .arte import ArteTvIE  from .auengine import AUEngineIE  from .bandcamp import BandcampIE  from .bliptv import BlipTVIE, BlipTVUserIE +from .bloomberg import BloombergIE  from .breakcom import BreakIE  from .brightcove import BrightcoveIE  from .c56 import C56IE @@ -17,21 +18,32 @@ from .comedycentral import ComedyCentralIE  from .condenast import CondeNastIE  from .criterion import CriterionIE  from .cspan import CSpanIE -from .dailymotion import DailymotionIE, DailymotionPlaylistIE +from .dailymotion import ( +    DailymotionIE, +    DailymotionPlaylistIE, +    DailymotionUserIE, +)  from .daum import DaumIE  from .depositfiles import DepositFilesIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE  from .defense import DefenseGouvFrIE +from .ebaumsworld import EbaumsWorldIE  from .ehow import EHowIE  from .eighttracks import EightTracksIE  from .escapist import EscapistIE  from .exfm import ExfmIE  from .facebook import FacebookIE +from .fktv import ( +    FKTVIE, +    FKTVPosteckeIE, +)  from .flickr import FlickrIE  from .francetv import (      PluzzIE,      FranceTvInfoIE, +    France2IE, +    GenerationQuoiIE  )  from .freesound import FreesoundIE  from .funnyordie import FunnyOrDieIE @@ -67,6 +79,7 @@ from .myvideo import MyVideoIE  from .naver import NaverIE  from .nba import NBAIE  from .nbc import NBCNewsIE +from .newgrounds import NewgroundsIE  from .ooyala import OoyalaIE  from .orf import ORFIE  from .pbs import PBSIE @@ -82,7 +95,8 @@ from .sina import SinaIE  from .slashdot import SlashdotIE  from .slideshare import SlideshareIE  from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .southparkstudios import SouthParkStudiosIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE  from .statigram import StatigramIE @@ -102,6 +116,7 @@ from .vbox7 import Vbox7IE  from .veehd import VeeHDIE  
from .veoh import VeohIE  from .vevo import VevoIE +from .vice import ViceIE  from .videofyme import VideofyMeIE  from .vimeo import VimeoIE, VimeoChannelIE  from .vine import VineIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c196..6d6237f8a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,8 +1,10 @@  import re  import xml.etree.ElementTree +import json  from .common import InfoExtractor  from ..utils import ( +    compat_urlparse,      determine_ext,  ) @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor):          u"playlist": [              {                  u"file": u"manofsteel-trailer4.mov", -                u"md5": u"11874af099d480cc09e103b189805d5f", +                u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8",                  u"info_dict": {                      u"duration": 111, -                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",                      u"title": u"Trailer 4",                      u"upload_date": u"20130523",                      u"uploader_id": u"wb", @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor):              },              {                  u"file": u"manofsteel-trailer3.mov", -                u"md5": u"07a0a262aae5afe68120eed61137ab34", +                u"md5": u"b8017b7131b721fb4e8d6f49e1df908c",                  u"info_dict": {                      u"duration": 182, -                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",                      u"title": u"Trailer 3",                      u"upload_date": u"20130417",                      u"uploader_id": u"wb", @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor):              },              {                  u"file": u"manofsteel-trailer.mov", -                u"md5": u"e401fde0813008e3307e54b6f384cff1", +                u"md5": 
u"d0f1e1150989b9924679b441f3404d48",                  u"info_dict": {                      u"duration": 148, -                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",                      u"title": u"Trailer",                      u"upload_date": u"20121212",                      u"uploader_id": u"wb", @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor):              },              {                  u"file": u"manofsteel-teaser.mov", -                u"md5": u"76b392f2ae9e7c98b22913c10a639c97", +                u"md5": u"5fe08795b943eb2e757fa95cb6def1cb",                  u"info_dict": {                      u"duration": 93, -                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",                      u"title": u"Teaser",                      u"upload_date": u"20120721",                      u"uploader_id": u"wb", @@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor):          ]      } +    _JSON_RE = r'iTunes.playURL\((.*?)\);' +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          movie = mobj.group('movie')          uploader_id = mobj.group('company') -        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' +        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')          playlist_snippet = self._download_webpage(playlist_url, movie) -        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) +        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) +        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) +        # The ' in the onClick attributes are not escaped, it couldn't be parsed +        # with xml.etree.ElementTree.fromstring +        # like: http://trailers.apple.com/trailers/wb/gravity/ +        def _clean_json(m): +            return 
u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') +        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)          playlist_html = u'<html>' + playlist_cleaned + u'</html>' -        size_cache = {} -          doc = xml.etree.ElementTree.fromstring(playlist_html)          playlist = []          for li in doc.findall('./div/ul/li'): -            title = li.find('.//h3').text +            on_click = li.find('.//a').attrib['onClick'] +            trailer_info_json = self._search_regex(self._JSON_RE, +                on_click, u'trailer info') +            trailer_info = json.loads(trailer_info_json) +            title = trailer_info['title']              video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()              thumbnail = li.find('.//img').attrib['src'] +            upload_date = trailer_info['posted'].replace('-', '') -            date_el = li.find('.//p') -            upload_date = None -            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) -            if m: -                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') -            runtime_el = date_el.find('./br') -            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) +            runtime = trailer_info['runtime'] +            m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)              duration = None              if m:                  duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) -            formats = [] -            for formats_el in li.findall('.//a'): -                if formats_el.attrib['class'] != 'OverlayPanel': -                    continue -                target = formats_el.attrib['target'] - -                format_code = formats_el.text -                if 'Automatic' in format_code: -                    continue +            first_url = trailer_info['url'] +            trailer_id = 
first_url.split('/')[-1].rpartition('_')[0].lower() +            settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) +            settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') +            settings = json.loads(settings_json) -                size_q = formats_el.attrib['href'] -                size_id = size_q.rpartition('#videos-')[2] -                if size_id not in size_cache: -                    size_url = url + size_q -                    sizepage_html = self._download_webpage( -                        size_url, movie, -                        note=u'Downloading size info %s' % size_id, -                        errnote=u'Error while downloading size info %s' % size_id, -                    ) -                    _doc = xml.etree.ElementTree.fromstring(sizepage_html) -                    size_cache[size_id] = _doc - -                sizepage_doc = size_cache[size_id] -                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') -                for vid_a in links: -                    href = vid_a.get('href') -                    if not href.endswith(target): -                        continue -                    detail_q = href.partition('#')[0] -                    detail_url = url + '/' + detail_q - -                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) -                    detail_id = m.group('detail_id') - -                    detail_html = self._download_webpage( -                        detail_url, movie, -                        note=u'Downloading detail %s %s' % (detail_id, size_id), -                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) -                    ) -                    detail_doc = xml.etree.ElementTree.fromstring(detail_html) -                    movie_link_el = 
detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') -                    assert movie_link_el.get('class') == 'movieLink' -                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') -                    ext = determine_ext(movie_link) -                    assert ext == 'mov' - -                    formats.append({ -                        'format': format_code, -                        'ext': ext, -                        'url': movie_link, -                    }) +            formats = [] +            for format in settings['metadata']['sizes']: +                # The src is a file pointing to the real video file +                format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) +                formats.append({ +                    'url': format_url, +                    'ext': determine_ext(format_url), +                    'format': format['type'], +                    'width': format['width'], +                    'height': int(format['height']), +                }) +            formats = sorted(formats, key=lambda f: (f['height'], f['width']))              info = {                  '_type': 'video', diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):              for fn,fdata in data['files'].items()              if 'Video' in fdata['format']]          formats.sort(key=lambda fdata: fdata['file_size']) +        for f in formats: +            f['ext'] = determine_ext(f['url'])          info = {              '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):              info['thumbnail'] = thumbnail          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return 
info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py new file mode 100644 index 000000000..3666a780b --- /dev/null +++ b/youtube_dl/extractor/bloomberg.py @@ -0,0 +1,27 @@ +import re + +from .common import InfoExtractor + + +class BloombergIE(InfoExtractor): +    _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html' + +    _TEST = { +        u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', +        u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', +        u'info_dict': { +            u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', +            u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', +        }, +        u'params': { +            # Requires ffmpeg (m3u8 manifest) +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        name = mobj.group('name') +        webpage = self._download_webpage(url, name) +        ooyala_url = self._og_search_video_url(webpage) +        return self.url_result(ooyala_url, ie='Ooyala') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 71e3c7883..558b3d009 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,3 +1,5 @@ +# encoding: utf-8 +  import re  import json  import xml.etree.ElementTree @@ -7,15 +9,39 @@ from ..utils import (      compat_urllib_parse,      find_xpath_attr,      compat_urlparse, + +    ExtractorError,  )  class BrightcoveIE(InfoExtractor):      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'      _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' -   
  -    # There is a test for Brigtcove in GenericIE, that way we test both the download -    # and the detection of videos, and we don't have to find an URL that is always valid + +    _TESTS = [ +        { +            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ +            u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', +            u'file': u'2371591881001.mp4', +            u'md5': u'9e80619e0a94663f0bdc849b4566af19', +            u'note': u'Test Brightcove downloads and detection in GenericIE', +            u'info_dict': { +                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', +                u'uploader': u'8TV', +                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', +            } +        }, +        { +            # From http://medianetwork.oracle.com/video/player/1785452137001 +            u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', +            u'file': u'1785452137001.flv', +            u'info_dict': { +                u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', +                u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', +                u'uploader': u'Oracle', +            }, +        }, +    ]      @classmethod      def _build_brighcove_url(cls, object_str): @@ -72,15 +98,27 @@ class BrightcoveIE(InfoExtractor):                                      playlist_title=playlist_info['mediaCollectionDTO']['displayName'])      def _extract_video_info(self, video_info): -        renditions = video_info['renditions'] -        renditions = sorted(renditions, key=lambda r: r['size']) -        best_format = renditions[-1] +        info = { +            'id': video_info['id'], +            'title': 
video_info['displayName'], +            'description': video_info.get('shortDescription'), +            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), +            'uploader': video_info.get('publisherName'), +        } -        return {'id': video_info['id'], -                'title': video_info['displayName'], -                'url': best_format['defaultURL'],  +        renditions = video_info.get('renditions') +        if renditions: +            renditions = sorted(renditions, key=lambda r: r['size']) +            best_format = renditions[-1] +            info.update({ +                'url': best_format['defaultURL'],                  'ext': 'mp4', -                'description': video_info.get('shortDescription'), -                'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), -                'uploader': video_info.get('publisherName'), -                } +            }) +        elif video_info.get('FLVFullLengthURL') is not None: +            info.update({ +                'url': video_info['FLVFullLengthURL'], +                'ext': 'flv', +            }) +        else: +            raise ExtractorError(u'Unable to extract video url for %s' % info['id']) +        return info diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 360113f9c..3f012aedc 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -14,8 +14,15 @@ from ..utils import (      ExtractorError,  ) +class DailymotionBaseInfoExtractor(InfoExtractor): +    @staticmethod +    def _build_request(url): +        """Build a request with the family filter disabled""" +        request = compat_urllib_request.Request(url) +        request.add_header('Cookie', 'family_filter=off') +        return request -class DailymotionIE(SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):      """Information Extractor for 
Dailymotion"""      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' @@ -40,8 +47,7 @@ class DailymotionIE(SubtitlesInfoExtractor):          url = 'http://www.dailymotion.com/video/%s' % video_id          # Retrieve video webpage to extract further information -        request = compat_urllib_request.Request(url) -        request.add_header('Cookie', 'family_filter=off') +        request = self._build_request(url)          webpage = self._download_webpage(request, video_id)          # Extract URL, uploader and title from webpage @@ -63,6 +69,9 @@ class DailymotionIE(SubtitlesInfoExtractor):          info = self._search_regex(r'var info = ({.*?}),$', embed_page,              'video info', flags=re.MULTILINE)          info = json.loads(info) +        if info.get('error') is not None: +            msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] +            raise ExtractorError(msg, expected=True)          # TODO: support choosing qualities @@ -110,29 +119,56 @@ class DailymotionIE(SubtitlesInfoExtractor):          return {} -class DailymotionPlaylistIE(InfoExtractor): +class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): +    IE_NAME = u'dailymotion:playlist'      _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'      _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>' +    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        playlist_id =  mobj.group('id') +    def _extract_entries(self, id):          video_ids = [] -          for pagenum in itertools.count(1): -            webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), -                                             playlist_id, u'Downloading page %s' % pagenum) +            request = 
self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) +            webpage = self._download_webpage(request, +                                             id, u'Downloading page %s' % pagenum)              playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)              video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el))              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:                  break - -        entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') +        return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')                     for video_id in video_ids] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        playlist_id = mobj.group('id') +        webpage = self._download_webpage(url, playlist_id) +          return {'_type': 'playlist',                  'id': playlist_id,                  'title': get_element_by_id(u'playlist_name', webpage), -                'entries': entries, +                'entries': self._extract_entries(playlist_id),                  } + + +class DailymotionUserIE(DailymotionPlaylistIE): +    IE_NAME = u'dailymotion:user' +    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)' +    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>' +    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        user = mobj.group('user') +        webpage = self._download_webpage(url, user) +        full_user = self._html_search_regex( +            r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user), +            webpage, u'user', flags=re.DOTALL) + +        return { +            '_type': 'playlist', +            'id': user, +            'title': full_user, +            'entries': 
self._extract_entries(user), +        } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 64b465805..765cb1f37 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):              'width': int(fe.find('./width').text),              'height': int(fe.find('./height').text),              'url': fe.find('./url').text, +            'ext': determine_ext(fe.find('./url').text),              'filesize': int(fe.find('./filesize').text),              'video_bitrate': int(fe.find('./videoBitrate').text),              '3sat_qualityname': fe.find('./quality').text, @@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py new file mode 100644 index 000000000..f02c6998b --- /dev/null +++ b/youtube_dl/extractor/ebaumsworld.py @@ -0,0 +1,37 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class EbaumsWorldIE(InfoExtractor): +    _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', +        u'file': u'83367677.mp4', +        u'info_dict': { +            u'title': u'A Giant Python Opens The Door', +            u'description': u'This is how nightmares start...', +            u'uploader': u'jihadpizza', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        config_xml = self._download_webpage( +            'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) +        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) +        video_url = config.find('file').text + +        return { +            'id': video_id, +            'title': config.find('title').text, +            'url': video_url, +            'ext': determine_ext(video_url), +            'description': config.find('description').text, +            'thumbnail': config.find('image').text, +            'uploader': config.find('username').text, +        } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index beaa5b4bd..9d1bc0751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -106,8 +106,8 @@ class FacebookIE(InfoExtractor):          video_duration = int(video_data['video_duration'])          thumbnail = video_data['thumbnail_src'] -        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>', -            
webpage, u'title') +        video_title = self._html_search_regex( +            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')          info = {              'id': video_id, diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py new file mode 100644 index 000000000..9c89362ef --- /dev/null +++ b/youtube_dl/extractor/fktv.py @@ -0,0 +1,79 @@ +import re +import random +import json + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    get_element_by_id, +    clean_html, +) + + +class FKTVIE(InfoExtractor): +    IE_NAME = u'fernsehkritik.tv' +    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + +    _TEST = { +        u'url': u'http://fernsehkritik.tv/folge-1', +        u'file': u'00011.flv', +        u'info_dict': { +            u'title': u'Folge 1 vom 10. April 2007', +            u'description': u'md5:fb4818139c7cfe6907d4b83412a6864f', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        episode = int(mobj.group('ep')) + +        server = random.randint(2, 4) +        video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode +        start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, +            episode) +        playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, +            u'playlist', flags=re.DOTALL) +        files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) +        # TODO: return a single multipart video +        videos = [] +        for i, _ in enumerate(files, 1): +            video_id = '%04d%d' % (episode, i) +            video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) +            video_title = 'Fernsehkritik %d.%d' % (episode, i) +            videos.append({ +                'id': video_id, +                'url': video_url, +                
'ext': determine_ext(video_url), +                'title': clean_html(get_element_by_id('eptitle', start_webpage)), +                'description': clean_html(get_element_by_id('contentlist', start_webpage)), +                'thumbnail': video_thumbnail +            }) +        return videos + + +class FKTVPosteckeIE(InfoExtractor): +    IE_NAME = u'fernsehkritik.tv:postecke' +    _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' +    _TEST = { +        u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', +        u'file': u'0120.flv', +        u'md5': u'262f0adbac80317412f7e57b4808e5c4', +        u'info_dict': { +            u"title": u"Postecke 120" +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        episode = int(mobj.group('ep')) + +        server = random.randint(2, 4) +        video_id = '%04d' % episode +        video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode) +        video_title = 'Postecke %d' % episode +        return { +            'id':       video_id, +            'url':      video_url, +            'ext':      determine_ext(video_url), +            'title':    video_title, +        } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f2b12c884..b1530e549 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,7 @@  # encoding: utf-8  import re  import xml.etree.ElementTree +import json  from .common import InfoExtractor  from ..utils import ( @@ -34,17 +35,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):      IE_NAME = u'pluzz.francetv.fr'      _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' -    _TEST = { -        u'url': u'http://pluzz.francetv.fr/videos/allo_rufo_saison5_,88439064.html', -        u'file': u'88439064.mp4', -        u'info_dict': { -        
    u'title': u'Allô Rufo', -            u'description': u'md5:d909f1ebdf963814b65772aea250400e', -        }, -        u'params': { -            u'skip_download': True, -        }, -    } +    # Can't use tests, videos expire in 7 days      def _real_extract(self, url):          title = re.match(self._VALID_URL, url).group(1) @@ -75,3 +66,52 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):          webpage = self._download_webpage(url, page_title)          video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id')          return self._extract_video(video_id) + + +class France2IE(FranceTVBaseInfoExtractor): +    IE_NAME = u'france2.fr' +    _VALID_URL = r'https?://www\.france2\.fr/emissions/.*?/videos/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', +        u'file': u'75540104.mp4', +        u'info_dict': { +            u'title': u'13h15, le samedi...', +            u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', +        }, +        u'params': { +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        return self._extract_video(video_id) + + +class GenerationQuoiIE(InfoExtractor): +    IE_NAME = u'http://generation-quoi.france2.fr' +    _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)' + +    _TEST = { +        u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', +        u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', +        u'info_dict': { +            u'title': u'Génération Quoi - Garde à Vous', +            u'uploader': u'Génération Quoi', +        }, +        u'params': { +            # It uses Dailymotion +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        name = mobj.group('name') +        
info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name) +        info_json = self._download_webpage(info_url, name) +        info = json.loads(info_json) +        return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], +            ie='Dailymotion') diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index f3d86a711..2ccdb7073 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,8 @@ class FunnyOrDieIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        video_url = self._search_regex(r'type="video/mp4" src="(.*?)"', +        video_url = self._search_regex( +            [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],              webpage, u'video URL', flags=re.DOTALL)          info = { diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f92e61fea..764070635 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,17 +29,6 @@ class GenericIE(InfoExtractor):                  u"title": u"R\u00e9gis plante sa Jeep"              }          }, -        { -            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', -            u'file': u'2371591881001.mp4', -            u'md5': u'9e80619e0a94663f0bdc849b4566af19', -            u'note': u'Test Brightcove downloads and detection in GenericIE', -            u'info_dict': { -                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', -                u'uploader': u'8TV', -                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', -            } -        },      ]      def report_download_webpage(self, video_id): diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- 
a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):          self.report_extraction(video_id)          # Extract update date -        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', +        upload_date = self._html_search_regex( +            ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],              webpage, u'upload date', fatal=False)          if upload_date:              # Convert timestring to a format suitable for filename diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ccca1d7e0..3798118a7 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -7,11 +7,11 @@ from .common import InfoExtractor  class HotNewHipHopIE(InfoExtractor):      _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P<id>.*)\.html'      _TEST = { -        u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html'", +        u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html",          u'file': u'1435540.mp3',          u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96',          u'info_dict': { -            u"title": u"Freddie Gibbs Songs - Lay It Down" +            u"title": u"Freddie Gibbs - Lay It Down"          }      } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 309921078..d04da98c8 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,7 +2,12 @@ import re  import json  from .common import InfoExtractor -from ..utils import compat_urllib_parse_urlparse, compat_urlparse +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urlparse, +    get_meta_content, +    ExtractorError, +)  class LivestreamIE(InfoExtractor): @@ -35,8 +40,11 @@ class LivestreamIE(InfoExtractor):          if video_id is None:              # This is an event page: - 
           api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', -                                         webpage, 'api url') +            player = get_meta_content('twitter:player', webpage) +            if player is None: +                raise ExtractorError('Couldn\'t extract event api url') +            api_url = player.replace('/player', '') +            api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url)              info = json.loads(self._download_webpage(api_url, event_name,                                                       u'Downloading event info'))              videos = [self._extract_video_info(video_data['data']) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket  from .common import InfoExtractor  from ..utils import (      compat_http_client, -    compat_str,      compat_urllib_error,      compat_urllib_request, - -    ExtractorError, +    unified_strdate,  )  class MixcloudIE(InfoExtractor): -    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/      _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'      IE_NAME = u'mixcloud' -    def report_download_json(self, file_id): -        """Report JSON download.""" -        self.to_screen(u'Downloading json') - -    def get_urls(self, jsonData, fmt, bitrate='best'): -        """Get urls from 'audio_formats' section in json""" -        try: -            bitrate_list = jsonData[fmt] -            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: -                bitrate = max(bitrate_list) # select highest - -            url_list = jsonData[fmt][bitrate] -        except TypeError: # we have no bitrate info. 
-            url_list = jsonData[fmt] -        return url_list +    _TEST = { +        u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', +        u'file': u'dholbach-cryptkeeper.mp3', +        u'info_dict': { +            u'title': u'Cryptkeeper', +            u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', +            u'uploader': u'Daniel Holbach', +            u'uploader_id': u'dholbach', +            u'upload_date': u'20111115', +        }, +    }      def check_urls(self, url_list):          """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):          return None -    def _print_formats(self, formats): -        print('Available formats:') -        for fmt in formats.keys(): -            for b in formats[fmt]: -                try: -                    ext = formats[fmt][b][0] -                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) -                except TypeError: # we have no bitrate info -                    ext = formats[fmt][0] -                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) -                    break -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) -        # extract uploader & filename from url -        uploader = mobj.group(1).decode('utf-8') -        file_id = uploader + "-" + mobj.group(2).decode('utf-8') - -        # construct API request -        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' -        # retrieve .json file with links to files -        request = compat_urllib_request.Request(file_url) -        try: -            self.report_download_json(file_url) -            jsonData = compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, 
compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) - -        # parse JSON -        json_data = json.loads(jsonData) -        player_url = json_data['player_swf_url'] -        formats = dict(json_data['audio_formats']) - -        req_format = self._downloader.params.get('format', None) - -        if self._downloader.params.get('listformats', None): -            self._print_formats(formats) -            return - -        if req_format is None or req_format == 'best': -            for format_param in formats.keys(): -                url_list = self.get_urls(formats, format_param) -                # check urls -                file_url = self.check_urls(url_list) -                if file_url is not None: -                    break # got it! -        else: -            if req_format not in formats: -                raise ExtractorError(u'Format is not available') - -            url_list = self.get_urls(formats, req_format) -            file_url = self.check_urls(url_list) -            format_param = req_format -        return [{ -            'id': file_id.decode('utf-8'), -            'url': file_url.decode('utf-8'), -            'uploader': uploader.decode('utf-8'), -            'upload_date': None, -            'title': json_data['name'], -            'ext': file_url.split('.')[-1].decode('utf-8'), -            'format': (format_param is None and u'NA' or format_param.decode('utf-8')), -            'thumbnail': json_data['thumbnail_url'], -            'description': json_data['description'], -            'player_url': player_url.decode('utf-8'), -        }] +        uploader = mobj.group(1) +        cloudcast_name = mobj.group(2) +        track_id = '-'.join((uploader, cloudcast_name)) +        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) +        webpage = self._download_webpage(url, track_id) +        json_data = self._download_webpage(api_url, 
track_id, +            u'Downloading cloudcast info') +        info = json.loads(json_data) + +        preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') +        song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') +        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) +        final_song_url = self.check_urls(template_url % i for i in range(30)) + +        return { +            'id': track_id, +            'title': info['name'], +            'url': final_song_url, +            'ext': 'mp3', +            'description': info['description'], +            'thumbnail': info['pictures'].get('extra_large'), +            'uploader': info['user']['name'], +            'uploader_id': info['user']['username'], +            'upload_date': unified_strdate(info['created_time']), +            'view_count': info['play_count'], +        } diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py new file mode 100644 index 000000000..2ef80bce0 --- /dev/null +++ b/youtube_dl/extractor/newgrounds.py @@ -0,0 +1,38 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class NewgroundsIE(InfoExtractor): +    _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P<id>\d+)' +    _TEST = { +        u'url': u'http://www.newgrounds.com/audio/listen/549479', +        u'file': u'549479.mp3', +        u'md5': u'fe6033d297591288fa1c1f780386f07a', +        u'info_dict': { +            u"title": u"B7 - BusMode", +            u"uploader": u"Burn7", +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        music_id = mobj.group('id') +        webpage = self._download_webpage(url, music_id) +         +        title = self._html_search_regex(r',"name":"([^"]+)",', webpage, u'music title') +        uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, u'music uploader') +      
   +        music_url_json_string = self._html_search_regex(r'({"url":"[^"]+"),', webpage, u'music url') + '}' +        music_url_json = json.loads(music_url_json_string) +        music_url = music_url_json['url'] + +        return { +            'id':       music_id, +            'title':    title, +            'url':      music_url, +            'uploader': uploader, +            'ext':      determine_ext(music_url), +        } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index b734722d0..1f7b4d2e7 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -18,11 +18,15 @@ class OoyalaIE(InfoExtractor):          },      } +    @staticmethod +    def _url_for_embed_code(embed_code): +        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code +      def _extract_result(self, info, more_info):          return {'id': info['embedCode'],                  'ext': 'mp4',                  'title': unescapeHTML(info['title']), -                'url': info['url'], +                'url': info.get('ipad_url') or info['url'],                  'description': unescapeHTML(more_info['description']),                  'thumbnail': more_info['promo'],                  } @@ -35,7 +39,9 @@ class OoyalaIE(InfoExtractor):          mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',                                          player, u'mobile player url')          mobile_player = self._download_webpage(mobile_url, embedCode) -        videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') +        videos_info = self._search_regex( +            r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', +            mobile_player, u'info').replace('\\"','"')          videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"')          videos_info = 
json.loads(videos_info)          videos_more_info =json.loads(videos_more_info) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 7bb236c2b..32541077f 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import (  )  class RTLnowIE(InfoExtractor): -    """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" -    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' +    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW and VOX NOW""" +    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'      _TESTS = [{          u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',          u'file': u'90419.flv', @@ -61,6 +61,19 @@ class RTLnowIE(InfoExtractor):          u'params': {              u'skip_download': True,          }, +    }, +    { +        u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', +        u'file': u'127367.flv', +        u'info_dict': { +            u'upload_date': u'20130926',  +            u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', +            u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', +            u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', +        }, +        u'params': { +            u'skip_download': True, +        },      }]      def _real_extract(self,url): @@ -79,7 +92,7 @@ 
class RTLnowIE(InfoExtractor):              msg = clean_html(note_m.group(1))              raise ExtractorError(msg) -        video_title = self._html_search_regex(r'<title>(?P<title>[^<]+)</title>', +        video_title = self._html_search_regex(r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',              webpage, u'title')          playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',              webpage, u'playerdata_url') diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@  import json  import re +import itertools  from .common import InfoExtractor  from ..utils import (      compat_str,      compat_urlparse, +    compat_urllib_parse,      ExtractorError,      unified_strdate, @@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor):      def _resolv_url(cls, url):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID -    def _extract_info_dict(self, info, full_title=None): +    def _extract_info_dict(self, info, full_title=None, quiet=False):          video_id = info['id']          name = full_title or video_id -        self.report_extraction(name) +        if quiet == False: +            self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):                  'id': info['id'],                  'title': info['title'],                  } + + +class SoundcloudUserIE(SoundcloudIE): +    _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' +    IE_NAME = u'soundcloud:user' + +    # it's in tests/test_playlists.py +    _TEST = None + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        uploader = mobj.group('user') + +        url = 
'http://soundcloud.com/%s/' % uploader +        resolv_url = self._resolv_url(url) +        user_json = self._download_webpage(resolv_url, uploader, +            u'Downloading user info') +        user = json.loads(user_json) + +        tracks = [] +        for i in itertools.count(): +            data = compat_urllib_parse.urlencode({'offset': i*50, +                                                  'client_id': self._CLIENT_ID, +                                                  }) +            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data +            response = self._download_webpage(tracks_url, uploader,  +                u'Downloading tracks page %s' % (i+1)) +            new_tracks = json.loads(response) +            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) +            if len(new_tracks) < 50: +                break + +        return { +            '_type': 'playlist', +            'id': compat_str(user['id']), +            'title': user['username'], +            'entries': tracks, +        } diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py new file mode 100644 index 000000000..b1e96b679 --- /dev/null +++ b/youtube_dl/extractor/southparkstudios.py @@ -0,0 +1,38 @@ +import re + +from .mtv import MTVIE, _media_xml_tag + + +class SouthParkStudiosIE(MTVIE): +    IE_NAME = u'southparkstudios.com' +    _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)' + +    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + +    _TEST = { +        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', +        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', +        u'info_dict': { +            u'title': u'Bat Daded', +            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', +        }, +    } + +    # Overwrite MTVIE 
properties we don't want +    _TESTS = [] + +    def _get_thumbnail_url(self, uri, itemdoc): +        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) +        thumb_node = itemdoc.find(search_path) +        if thumb_node is None: +            return None +        else: +            return thumb_node.attrib['url'] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) +        mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', +                                  webpage, u'mgid') +        return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 97215f289..90de7de3a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):      @property      def _have_to_download_any_subtitles(self):          return any([self._downloader.params.get('writesubtitles', False), -                    self._downloader.params.get('writeautomaticsub'), -                    self._downloader.params.get('allsubtitles', False)]) +                    self._downloader.params.get('writeautomaticsub')])      def _list_available_subtitles(self, video_id, webpage=None):          """ outputs the available subtitles for the video """ @@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):          available_subs_list = {}          if self._downloader.params.get('writeautomaticsub', False):              available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) -        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): +        if self._downloader.params.get('writesubtitles', False):              available_subs_list.update(self._get_available_subtitles(video_id))          if not 
available_subs_list:  # error, it didn't get the available subtitles diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index f278951ba..0bf028f61 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):              {                  'format': fnode.text,                  'url': video_url_template % fnode.text, +                'ext': fnode.text.partition('-')[0]              }              for fnode in format_doc.findall('./formats/format') @@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = formats[-1]['format'].partition('-')[0] +        info.update(formats[-1])          return info diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py new file mode 100644 index 000000000..6b93afa50 --- /dev/null +++ b/youtube_dl/extractor/vice.py @@ -0,0 +1,38 @@ +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ExtractorError + + +class ViceIE(InfoExtractor): +    _VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)' + +    _TEST = { +        u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', +        u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4', +        u'info_dict': { +            u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', +        }, +        u'params': { +            # Requires ffmpeg (m3u8 manifest) +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        name = mobj.group('name') +        webpage = self._download_webpage(url, name) +        try: +            ooyala_url = self._og_search_video_url(webpage) +        except ExtractorError: +            try: +                embed_code = self._search_regex( +                    
r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage, +                    u'ooyala embed code') +                ooyala_url = OoyalaIE._url_for_embed_code(embed_code) +            except ExtractorError: +                raise ExtractorError(u'The page doesn\'t contain a video', expected=True) +        return self.url_result(ooyala_url, ie='Ooyala') + diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 88b8b6be0..361619694 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,8 +11,8 @@ from ..utils import (  class XHamsterIE(InfoExtractor):      """Information Extractor for xHamster""" -    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' -    _TEST = { +    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' +    _TESTS = [{          u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',          u'file': u'1509445.flv',          u'md5': u'9f48e0e8d58e3076bb236ff412ab62fa', @@ -21,13 +21,24 @@ class XHamsterIE(InfoExtractor):              u"uploader_id": u"Ruseful2011",               u"title": u"FemaleAgent Shy beauty takes the bait"          } -    } +    }, +    { +        u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', +        u'file': u'2221348.flv', +        u'md5': u'e767b9475de189320f691f49c679c4c7', +        u'info_dict': { +            u"upload_date": u"20130914",  +            u"uploader_id": u"jojo747400",  +            u"title": u"Britney Spears  Sexy Booty" +        } +    }]      def _real_extract(self,url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id +        seo = mobj.group('seo') +        mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo)          webpage = self._download_webpage(mrss_url, 
video_id)          mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 32d5b9477..39126e631 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,4 +1,3 @@ -import datetime  import itertools  import json  import re @@ -6,86 +5,85 @@ import re  from .common import InfoExtractor, SearchInfoExtractor  from ..utils import (      compat_urllib_parse, - -    ExtractorError, +    compat_urlparse, +    determine_ext, +    clean_html,  ) +  class YahooIE(InfoExtractor):      IE_DESC = u'Yahoo screen'      _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' -    _TEST = { -        u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', -        u'file': u'214727115.flv', -        u'md5': u'2e717f169c1be93d84d3794a00d4a325', -        u'info_dict': { -            u"title": u"Julian Smith & Travis Legg Watch Julian Smith" +    _TESTS = [ +        { +            u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', +            u'file': u'214727115.mp4', +            u'info_dict': { +                u'title': u'Julian Smith & Travis Legg Watch Julian Smith', +                u'description': u'Julian and Travis watch Julian Smith', +            },          }, -        u'skip': u'Requires rtmpdump' -    } +        { +            u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', +            u'file': u'103000935.flv', +            u'info_dict': { +                u'title': u'The Cougar Lies with Spanish Moss', +                u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? 
And if your waffles have nothing to hide, what are they so worried about?', +            }, +            u'params': { +                # Requires rtmpdump +                u'skip_download': True, +            }, +        }, +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage) -        if m_id is None:  -            # TODO: Check which url parameters are required -            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id -            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') -            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.* -                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.* -                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.* -                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB" -                        ''' -            self.report_extraction(video_id) -            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL) -            if m_info is None: -                raise ExtractorError(u'Unable to extract video info') -            video_title = m_info.group('title') -            video_description = m_info.group('description') -            video_thumb = m_info.group('thumb') -            video_date = m_info.group('date') -            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d') -     -            # TODO: Find a way to get mp4 videos -            rest_url = 
'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id -            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage') -            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage) -            video_url = m_rest.group('url') -            video_path = m_rest.group('path') -            if m_rest is None: -                raise ExtractorError(u'Unable to extract video url') +        items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$', +            webpage, u'items', flags=re.MULTILINE) +        items = json.loads(items_json) +        info = items['mediaItems']['query']['results']['mediaObj'][0] +        meta = info['meta'] + +        formats = [] +        for s in info['streams']: +            format_info = { +                'width': s.get('width'), +                'height': s.get('height'), +                'bitrate': s.get('bitrate'), +            } + +            host = s['host'] +            path = s['path'] +            if host.startswith('rtmp'): +                format_info.update({ +                    'url': host, +                    'play_path': path, +                    'ext': 'flv', +                }) +            else: +                format_url = compat_urlparse.urljoin(host, path) +                format_info['url'] = format_url +                format_info['ext'] = determine_ext(format_url) +                 +            formats.append(format_info) +        formats = sorted(formats, key=lambda f:(f['height'], f['width'])) + +        info = { +            'id': video_id, +            'title': meta['title'], +            'formats': formats, +            'description': clean_html(meta['description']), +            'thumbnail': meta['thumbnail'], +        } +        # TODO: Remove when #980 has been merged 
+        info.update(formats[-1]) -        else: # We have to use a different method if another id is defined -            long_id = m_id.group('new_id') -            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335' -            webpage = self._download_webpage(info_url, video_id, u'Downloading info json') -            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1) -            info = json.loads(json_str) -            res = info[u'query'][u'results'][u'mediaObj'][0] -            stream = res[u'streams'][0] -            video_path = stream[u'path'] -            video_url = stream[u'host'] -            meta = res[u'meta'] -            video_title = meta[u'title'] -            video_description = meta[u'description'] -            video_thumb = meta[u'thumbnail'] -            video_date = None # I can't find it +        return info -        info_dict = { -                     'id': video_id, -                     'url': video_url, -                     'play_path': video_path, -                     'title':video_title, -                     'description': video_description, -                     'thumbnail': video_thumb, -                     'upload_date': video_date, -                     'ext': 'flv', -                     } -        return info_dict  class YahooSearchIE(SearchInfoExtractor):      IE_DESC = u'Yahoo screen search' diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 996d38478..00fa2ccb5 100644 --- 
a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -66,6 +66,12 @@ class YoukuIE(InfoExtractor):          self.report_extraction(video_id)          try:              config = json.loads(jsondata) +            error_code = config['data'][0].get('error_code') +            if error_code: +                # -8 means blocked outside China. +                error = config['data'][0].get('error')  # Chinese and English, separated by newline. +                raise ExtractorError(error or u'Server reported error %i' % error_code, +                    expected=True)              video_title =  config['data'][0]['title']              seed = config['data'][0]['seed'] @@ -89,6 +95,7 @@ class YoukuIE(InfoExtractor):              fileid = config['data'][0]['streamfileids'][format]              keys = [s['k'] for s in config['data'][0]['segs'][format]] +            # segs is usually a dictionary, but an empty *list* if an error occured.          except (UnicodeDecodeError, ValueError, KeyError):              raise ExtractorError(u'Unable to extract info section') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f49665925..53f13b516 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,15 +1,23 @@  # coding: utf-8 +import collections +import errno +import io +import itertools  import json -import netrc +import os.path  import re  import socket -import itertools +import string +import struct +import traceback  import xml.etree.ElementTree +import zlib  from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import ( +    compat_chr,      compat_http_client,      compat_parse_qs,      compat_urllib_error, @@ -23,6 +31,7 @@ from ..utils import (      unescapeHTML,      unified_strdate,      orderedSet, +    write_json_file,  )  class YoutubeBaseInfoExtractor(InfoExtractor): @@ -139,7 +148,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, 
SubtitlesInfoExtractor):                       (                           (?:https?://)?                                       # http(s):// (optional)                           (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| -                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains +                            tube\.majestyc\.net/| +                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls                           (?:                                                  # the various things that can precede the ID:                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/ @@ -351,7 +361,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              u"info_dict": {                  u"upload_date": u"20120506",                  u"title": u"Icona Pop - I Love It (feat. 
Charli XCX) [OFFICIAL VIDEO]", -                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", +                u"description": u"md5:5b292926389560516e384ac437c0ec07",                  u"uploader": u"Icona Pop",                  u"uploader_id": u"IconaPop"              } @@ -368,21 +378,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  u"uploader_id": u"justintimberlakeVEVO"              }          }, -        { -            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE', -            u'file': u'TGi3HqYrWHE.mp4', -            u'note': u'm3u8 video', -            u'info_dict': { -                u'title': u'Triathlon - Men - London 2012 Olympic Games', -                u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games', -                u'uploader': u'olympic', -                u'upload_date': u'20120807', -                u'uploader_id': u'olympic', -            }, -            u'params': { -                u'skip_download': True, -            }, -        },      ] @@ -392,6 +387,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          if YoutubePlaylistIE.suitable(url): return False          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None +    def __init__(self, *args, **kwargs): +        super(YoutubeIE, self).__init__(*args, **kwargs) +        self._player_cache = {} +      def report_video_webpage_download(self, video_id):          """Report attempt to download video webpage."""          self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -412,11 +411,664 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          """Indicate the download will use the RTMP protocol."""          self.to_screen(u'RTMP download detected') -    def _decrypt_signature(self, s): +    def _extract_signature_function(self, video_id, player_url, slen): +        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', 
+                        player_url) +        player_type = id_m.group('ext') +        player_id = id_m.group('id') + +        # Read from filesystem cache +        func_id = '%s_%s_%d' % (player_type, player_id, slen) +        assert os.path.basename(func_id) == func_id +        cache_dir = self._downloader.params.get('cachedir', +                                                u'~/.youtube-dl/cache') + +        cache_enabled = cache_dir is not None +        if cache_enabled: +            cache_fn = os.path.join(os.path.expanduser(cache_dir), +                                    u'youtube-sigfuncs', +                                    func_id + '.json') +            try: +                with io.open(cache_fn, 'r', encoding='utf-8') as cachef: +                    cache_spec = json.load(cachef) +                return lambda s: u''.join(s[i] for i in cache_spec) +            except IOError: +                pass  # No cache available + +        if player_type == 'js': +            code = self._download_webpage( +                player_url, video_id, +                note=u'Downloading %s player %s' % (player_type, player_id), +                errnote=u'Download of %s failed' % player_url) +            res = self._parse_sig_js(code) +        elif player_type == 'swf': +            urlh = self._request_webpage( +                player_url, video_id, +                note=u'Downloading %s player %s' % (player_type, player_id), +                errnote=u'Download of %s failed' % player_url) +            code = urlh.read() +            res = self._parse_sig_swf(code) +        else: +            assert False, 'Invalid player type %r' % player_type + +        if cache_enabled: +            try: +                test_string = u''.join(map(compat_chr, range(slen))) +                cache_res = res(test_string) +                cache_spec = [ord(c) for c in cache_res] +                try: +                    os.makedirs(os.path.dirname(cache_fn)) +                except 
OSError as ose: +                    if ose.errno != errno.EEXIST: +                        raise +                write_json_file(cache_spec, cache_fn) +            except Exception: +                tb = traceback.format_exc() +                self._downloader.report_warning( +                    u'Writing cache to %r failed: %s' % (cache_fn, tb)) + +        return res + +    def _print_sig_code(self, func, slen): +        def gen_sig_code(idxs): +            def _genslice(start, end, step): +                starts = u'' if start == 0 else str(start) +                ends = (u':%d' % (end+step)) if end + step >= 0 else u':' +                steps = u'' if step == 1 else (u':%d' % step) +                return u's[%s%s%s]' % (starts, ends, steps) + +            step = None +            start = '(Never used)'  # Quelch pyflakes warnings - start will be +                                    # set as soon as step is set +            for i, prev in zip(idxs[1:], idxs[:-1]): +                if step is not None: +                    if i - prev == step: +                        continue +                    yield _genslice(start, prev, step) +                    step = None +                    continue +                if i - prev in [-1, 1]: +                    step = i - prev +                    start = prev +                    continue +                else: +                    yield u's[%d]' % prev +            if step is None: +                yield u's[%d]' % i +            else: +                yield _genslice(start, i, step) + +        test_string = u''.join(map(compat_chr, range(slen))) +        cache_res = func(test_string) +        cache_spec = [ord(c) for c in cache_res] +        expr_code = u' + '.join(gen_sig_code(cache_spec)) +        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code) +        self.to_screen(u'Extracted signature function:\n' + code) + +    def _parse_sig_js(self, jscode): +        funcname = self._search_regex( +       
     r'signature=([a-zA-Z]+)', jscode, +            u'Initial JS player signature function name') + +        functions = {} + +        def argidx(varname): +            return string.lowercase.index(varname) + +        def interpret_statement(stmt, local_vars, allow_recursion=20): +            if allow_recursion < 0: +                raise ExtractorError(u'Recursion limit reached') + +            if stmt.startswith(u'var '): +                stmt = stmt[len(u'var '):] +            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' + +                             r'=(?P<expr>.*)$', stmt) +            if ass_m: +                if ass_m.groupdict().get('index'): +                    def assign(val): +                        lvar = local_vars[ass_m.group('out')] +                        idx = interpret_expression(ass_m.group('index'), +                                                   local_vars, allow_recursion) +                        assert isinstance(idx, int) +                        lvar[idx] = val +                        return val +                    expr = ass_m.group('expr') +                else: +                    def assign(val): +                        local_vars[ass_m.group('out')] = val +                        return val +                    expr = ass_m.group('expr') +            elif stmt.startswith(u'return '): +                assign = lambda v: v +                expr = stmt[len(u'return '):] +            else: +                raise ExtractorError( +                    u'Cannot determine left side of statement in %r' % stmt) + +            v = interpret_expression(expr, local_vars, allow_recursion) +            return assign(v) + +        def interpret_expression(expr, local_vars, allow_recursion): +            if expr.isdigit(): +                return int(expr) + +            if expr.isalpha(): +                return local_vars[expr] + +            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) +            if m: + 
               member = m.group('member') +                val = local_vars[m.group('in')] +                if member == 'split("")': +                    return list(val) +                if member == 'join("")': +                    return u''.join(val) +                if member == 'length': +                    return len(val) +                if member == 'reverse()': +                    return val[::-1] +                slice_m = re.match(r'slice\((?P<idx>.*)\)', member) +                if slice_m: +                    idx = interpret_expression( +                        slice_m.group('idx'), local_vars, allow_recursion-1) +                    return val[idx:] + +            m = re.match( +                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) +            if m: +                val = local_vars[m.group('in')] +                idx = interpret_expression(m.group('idx'), local_vars, +                                           allow_recursion-1) +                return val[idx] + +            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) +            if m: +                a = interpret_expression(m.group('a'), +                                         local_vars, allow_recursion) +                b = interpret_expression(m.group('b'), +                                         local_vars, allow_recursion) +                return a % b + +            m = re.match( +                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr) +            if m: +                fname = m.group('func') +                if fname not in functions: +                    functions[fname] = extract_function(fname) +                argvals = [int(v) if v.isdigit() else local_vars[v] +                           for v in m.group('args').split(',')] +                return functions[fname](argvals) +            raise ExtractorError(u'Unsupported JS expression %r' % expr) + +        def extract_function(funcname): +            func_m = re.search( +                
r'function ' + re.escape(funcname) + +                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', +                jscode) +            argnames = func_m.group('args').split(',') + +            def resf(args): +                local_vars = dict(zip(argnames, args)) +                for stmt in func_m.group('code').split(';'): +                    res = interpret_statement(stmt, local_vars) +                return res +            return resf + +        initial_function = extract_function(funcname) +        return lambda s: initial_function([s]) + +    def _parse_sig_swf(self, file_contents): +        if file_contents[1:3] != b'WS': +            raise ExtractorError( +                u'Not an SWF file; header is %r' % file_contents[:3]) +        if file_contents[:1] == b'C': +            content = zlib.decompress(file_contents[8:]) +        else: +            raise NotImplementedError(u'Unsupported compression format %r' % +                                      file_contents[:1]) + +        def extract_tags(content): +            pos = 0 +            while pos < len(content): +                header16 = struct.unpack('<H', content[pos:pos+2])[0] +                pos += 2 +                tag_code = header16 >> 6 +                tag_len = header16 & 0x3f +                if tag_len == 0x3f: +                    tag_len = struct.unpack('<I', content[pos:pos+4])[0] +                    pos += 4 +                assert pos+tag_len <= len(content) +                yield (tag_code, content[pos:pos+tag_len]) +                pos += tag_len + +        code_tag = next(tag +                        for tag_code, tag in extract_tags(content) +                        if tag_code == 82) +        p = code_tag.index(b'\0', 4) + 1 +        code_reader = io.BytesIO(code_tag[p:]) + +        # Parse ABC (AVM2 ByteCode) +        def read_int(reader=None): +            if reader is None: +                reader = code_reader +            res = 0 +            shift = 0 +            for _ 
in range(5): +                buf = reader.read(1) +                assert len(buf) == 1 +                b = struct.unpack('<B', buf)[0] +                res = res | ((b & 0x7f) << shift) +                if b & 0x80 == 0: +                    break +                shift += 7 +            return res + +        def u30(reader=None): +            res = read_int(reader) +            assert res & 0xf0000000 == 0 +            return res +        u32 = read_int + +        def s32(reader=None): +            v = read_int(reader) +            if v & 0x80000000 != 0: +                v = - ((v ^ 0xffffffff) + 1) +            return v + +        def read_string(reader=None): +            if reader is None: +                reader = code_reader +            slen = u30(reader) +            resb = reader.read(slen) +            assert len(resb) == slen +            return resb.decode('utf-8') + +        def read_bytes(count, reader=None): +            if reader is None: +                reader = code_reader +            resb = reader.read(count) +            assert len(resb) == count +            return resb + +        def read_byte(reader=None): +            resb = read_bytes(1, reader=reader) +            res = struct.unpack('<B', resb)[0] +            return res + +        # minor_version + major_version +        read_bytes(2 + 2) + +        # Constant pool +        int_count = u30() +        for _c in range(1, int_count): +            s32() +        uint_count = u30() +        for _c in range(1, uint_count): +            u32() +        double_count = u30() +        read_bytes((double_count-1) * 8) +        string_count = u30() +        constant_strings = [u''] +        for _c in range(1, string_count): +            s = read_string() +            constant_strings.append(s) +        namespace_count = u30() +        for _c in range(1, namespace_count): +            read_bytes(1)  # kind +            u30()  # name +        ns_set_count = u30() +        for _c in range(1, 
ns_set_count): +            count = u30() +            for _c2 in range(count): +                u30() +        multiname_count = u30() +        MULTINAME_SIZES = { +            0x07: 2,  # QName +            0x0d: 2,  # QNameA +            0x0f: 1,  # RTQName +            0x10: 1,  # RTQNameA +            0x11: 0,  # RTQNameL +            0x12: 0,  # RTQNameLA +            0x09: 2,  # Multiname +            0x0e: 2,  # MultinameA +            0x1b: 1,  # MultinameL +            0x1c: 1,  # MultinameLA +        } +        multinames = [u''] +        for _c in range(1, multiname_count): +            kind = u30() +            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind +            if kind == 0x07: +                u30()  # namespace_idx +                name_idx = u30() +                multinames.append(constant_strings[name_idx]) +            else: +                multinames.append('[MULTINAME kind: %d]' % kind) +                for _c2 in range(MULTINAME_SIZES[kind]): +                    u30() + +        # Methods +        method_count = u30() +        MethodInfo = collections.namedtuple( +            'MethodInfo', +            ['NEED_ARGUMENTS', 'NEED_REST']) +        method_infos = [] +        for method_id in range(method_count): +            param_count = u30() +            u30()  # return type +            for _ in range(param_count): +                u30()  # param type +            u30()  # name index (always 0 for youtube) +            flags = read_byte() +            if flags & 0x08 != 0: +                # Options present +                option_count = u30() +                for c in range(option_count): +                    u30()  # val +                    read_bytes(1)  # kind +            if flags & 0x80 != 0: +                # Param names present +                for _ in range(param_count): +                    u30()  # param name +            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) +            
method_infos.append(mi) + +        # Metadata +        metadata_count = u30() +        for _c in range(metadata_count): +            u30()  # name +            item_count = u30() +            for _c2 in range(item_count): +                u30()  # key +                u30()  # value + +        def parse_traits_info(): +            trait_name_idx = u30() +            kind_full = read_byte() +            kind = kind_full & 0x0f +            attrs = kind_full >> 4 +            methods = {} +            if kind in [0x00, 0x06]:  # Slot or Const +                u30()  # Slot id +                u30()  # type_name_idx +                vindex = u30() +                if vindex != 0: +                    read_byte()  # vkind +            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter +                u30()  # disp_id +                method_idx = u30() +                methods[multinames[trait_name_idx]] = method_idx +            elif kind == 0x04:  # Class +                u30()  # slot_id +                u30()  # classi +            elif kind == 0x05:  # Function +                u30()  # slot_id +                function_idx = u30() +                methods[function_idx] = multinames[trait_name_idx] +            else: +                raise ExtractorError(u'Unsupported trait kind %d' % kind) + +            if attrs & 0x4 != 0:  # Metadata present +                metadata_count = u30() +                for _c3 in range(metadata_count): +                    u30()  # metadata index + +            return methods + +        # Classes +        TARGET_CLASSNAME = u'SignatureDecipher' +        searched_idx = multinames.index(TARGET_CLASSNAME) +        searched_class_id = None +        class_count = u30() +        for class_id in range(class_count): +            name_idx = u30() +            if name_idx == searched_idx: +                # We found the class we're looking for! 
+                searched_class_id = class_id +            u30()  # super_name idx +            flags = read_byte() +            if flags & 0x08 != 0:  # Protected namespace is present +                u30()  # protected_ns_idx +            intrf_count = u30() +            for _c2 in range(intrf_count): +                u30() +            u30()  # iinit +            trait_count = u30() +            for _c2 in range(trait_count): +                parse_traits_info() + +        if searched_class_id is None: +            raise ExtractorError(u'Target class %r not found' % +                                 TARGET_CLASSNAME) + +        method_names = {} +        method_idxs = {} +        for class_id in range(class_count): +            u30()  # cinit +            trait_count = u30() +            for _c2 in range(trait_count): +                trait_methods = parse_traits_info() +                if class_id == searched_class_id: +                    method_names.update(trait_methods.items()) +                    method_idxs.update(dict( +                        (idx, name) +                        for name, idx in trait_methods.items())) + +        # Scripts +        script_count = u30() +        for _c in range(script_count): +            u30()  # init +            trait_count = u30() +            for _c2 in range(trait_count): +                parse_traits_info() + +        # Method bodies +        method_body_count = u30() +        Method = collections.namedtuple('Method', ['code', 'local_count']) +        methods = {} +        for _c in range(method_body_count): +            method_idx = u30() +            u30()  # max_stack +            local_count = u30() +            u30()  # init_scope_depth +            u30()  # max_scope_depth +            code_length = u30() +            code = read_bytes(code_length) +            if method_idx in method_idxs: +                m = Method(code, local_count) +                methods[method_idxs[method_idx]] = m +            
exception_count = u30() +            for _c2 in range(exception_count): +                u30()  # from +                u30()  # to +                u30()  # target +                u30()  # exc_type +                u30()  # var_name +            trait_count = u30() +            for _c2 in range(trait_count): +                parse_traits_info() + +        assert p + code_reader.tell() == len(code_tag) +        assert len(methods) == len(method_idxs) + +        method_pyfunctions = {} + +        def extract_function(func_name): +            if func_name in method_pyfunctions: +                return method_pyfunctions[func_name] +            if func_name not in methods: +                raise ExtractorError(u'Cannot find function %r' % func_name) +            m = methods[func_name] + +            def resfunc(args): +                registers = ['(this)'] + list(args) + [None] * m.local_count +                stack = [] +                coder = io.BytesIO(m.code) +                while True: +                    opcode = struct.unpack('!B', coder.read(1))[0] +                    if opcode == 36:  # pushbyte +                        v = struct.unpack('!B', coder.read(1))[0] +                        stack.append(v) +                    elif opcode == 44:  # pushstring +                        idx = u30(coder) +                        stack.append(constant_strings[idx]) +                    elif opcode == 48:  # pushscope +                        # We don't implement the scope register, so we'll just +                        # ignore the popped value +                        stack.pop() +                    elif opcode == 70:  # callproperty +                        index = u30(coder) +                        mname = multinames[index] +                        arg_count = u30(coder) +                        args = list(reversed( +                            [stack.pop() for _ in range(arg_count)])) +                        obj = stack.pop() +                        if 
mname == u'split': +                            assert len(args) == 1 +                            assert isinstance(args[0], compat_str) +                            assert isinstance(obj, compat_str) +                            if args[0] == u'': +                                res = list(obj) +                            else: +                                res = obj.split(args[0]) +                            stack.append(res) +                        elif mname == u'slice': +                            assert len(args) == 1 +                            assert isinstance(args[0], int) +                            assert isinstance(obj, list) +                            res = obj[args[0]:] +                            stack.append(res) +                        elif mname == u'join': +                            assert len(args) == 1 +                            assert isinstance(args[0], compat_str) +                            assert isinstance(obj, list) +                            res = args[0].join(obj) +                            stack.append(res) +                        elif mname in method_pyfunctions: +                            stack.append(method_pyfunctions[mname](args)) +                        else: +                            raise NotImplementedError( +                                u'Unsupported property %r on %r' +                                % (mname, obj)) +                    elif opcode == 72:  # returnvalue +                        res = stack.pop() +                        return res +                    elif opcode == 79:  # callpropvoid +                        index = u30(coder) +                        mname = multinames[index] +                        arg_count = u30(coder) +                        args = list(reversed( +                            [stack.pop() for _ in range(arg_count)])) +                        obj = stack.pop() +                        if mname == u'reverse': +                            assert 
isinstance(obj, list) +                            obj.reverse() +                        else: +                            raise NotImplementedError( +                                u'Unsupported (void) property %r on %r' +                                % (mname, obj)) +                    elif opcode == 93:  # findpropstrict +                        index = u30(coder) +                        mname = multinames[index] +                        res = extract_function(mname) +                        stack.append(res) +                    elif opcode == 97:  # setproperty +                        index = u30(coder) +                        value = stack.pop() +                        idx = stack.pop() +                        obj = stack.pop() +                        assert isinstance(obj, list) +                        assert isinstance(idx, int) +                        obj[idx] = value +                    elif opcode == 98:  # getlocal +                        index = u30(coder) +                        stack.append(registers[index]) +                    elif opcode == 99:  # setlocal +                        index = u30(coder) +                        value = stack.pop() +                        registers[index] = value +                    elif opcode == 102:  # getproperty +                        index = u30(coder) +                        pname = multinames[index] +                        if pname == u'length': +                            obj = stack.pop() +                            assert isinstance(obj, list) +                            stack.append(len(obj)) +                        else:  # Assume attribute access +                            idx = stack.pop() +                            assert isinstance(idx, int) +                            obj = stack.pop() +                            assert isinstance(obj, list) +                            stack.append(obj[idx]) +                    elif opcode == 128:  # coerce +                        
u30(coder) +                    elif opcode == 133:  # coerce_s +                        assert isinstance(stack[-1], (type(None), compat_str)) +                    elif opcode == 164:  # modulo +                        value2 = stack.pop() +                        value1 = stack.pop() +                        res = value1 % value2 +                        stack.append(res) +                    elif opcode == 208:  # getlocal_0 +                        stack.append(registers[0]) +                    elif opcode == 209:  # getlocal_1 +                        stack.append(registers[1]) +                    elif opcode == 210:  # getlocal_2 +                        stack.append(registers[2]) +                    elif opcode == 211:  # getlocal_3 +                        stack.append(registers[3]) +                    elif opcode == 214:  # setlocal_2 +                        registers[2] = stack.pop() +                    elif opcode == 215:  # setlocal_3 +                        registers[3] = stack.pop() +                    else: +                        raise NotImplementedError( +                            u'Unsupported opcode %d' % opcode) + +            method_pyfunctions[func_name] = resfunc +            return resfunc + +        initial_function = extract_function(u'decipher') +        return lambda s: initial_function([s]) + +    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):          """Turn the encrypted s field into a working signature""" -        if len(s) == 92: +        if player_url is not None: +            try: +                if player_url not in self._player_cache: +                    func = self._extract_signature_function( +                        video_id, player_url, len(s) +                    ) +                    self._player_cache[player_url] = func +                func = self._player_cache[player_url] +                if self._downloader.params.get('youtube_print_sig_code'): +                    
self._print_sig_code(func, len(s)) +                return func(s) +            except Exception: +                tb = traceback.format_exc() +                self._downloader.report_warning( +                    u'Automatic signature extraction failed: ' + tb) + +            self._downloader.report_warning( +                u'Warning: Falling back to static signature algorithm') + +        return self._static_decrypt_signature( +            s, video_id, player_url, age_gate) + +    def _static_decrypt_signature(self, s, video_id, player_url, age_gate): +        if age_gate: +            # The videos with age protection use another player, so the +            # algorithms can be different. +            if len(s) == 86: +                return s[2:63] + s[82] + s[64:82] + s[63] + +        if len(s) == 93: +            return s[86:29:-1] + s[88] + s[28:5:-1] +        elif len(s) == 92:              return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] +        elif len(s) == 91: +            return s[84:27:-1] + s[86] + s[26:5:-1]          elif len(s) == 90:              return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]          elif len(s) == 89: @@ -426,13 +1078,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          elif len(s) == 87:              return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]          elif len(s) == 86: -            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] +            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]          elif len(s) == 85: -            return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1] +            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]          elif len(s) == 84: -            return s[81:36:-1] + s[0] + s[35:2:-1] +            return s[78:70:-1] + s[14] + 
s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]          elif len(s) == 83: -            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] +            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]          elif len(s) == 82:              return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]          elif len(s) == 81: @@ -445,15 +1097,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _decrypt_signature_age_gate(self, s): -        # The videos with age protection use another player, so the algorithms -        # can be different. -        if len(s) == 86: -            return s[2:63] + s[82] + s[64:82] + s[63] -        else: -            # Fallback to the other algortihms -            return self._decrypt_signature(s) -      def _get_available_subtitles(self, video_id):          try:              sub_list = self._download_webpage( @@ -626,7 +1269,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')          # Attempt to extract SWF player URL -        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) +        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)          if mobj is not None:              player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))          else: @@ -702,7 +1345,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              video_thumbnail = m_thumb.group(1)          elif 'thumbnail_url' not in video_info:              self._downloader.report_warning(u'unable to extract video thumbnail') -            video_thumbnail = '' +            video_thumbnail = None          else:   # don't panic if we can't 
find it              video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -779,24 +1422,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                      if 'sig' in url_data:                          url += '&signature=' + url_data['sig'][0]                      elif 's' in url_data: +                        encrypted_sig = url_data['s'][0]                          if self._downloader.params.get('verbose'): -                            s = url_data['s'][0]                              if age_gate: -                                player_version = self._search_regex(r'ad3-(.+?)\.swf', -                                    video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND', -                                    'flash player', fatal=False) -                                player = 'flash player %s' % player_version +                                if player_url is None: +                                    player_version = 'unknown' +                                else: +                                    player_version = self._search_regex( +                                        r'-(.+)\.swf$', player_url, +                                        u'flash player', fatal=False) +                                player_desc = 'flash player %s' % player_version                              else: -                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, +                                player_version = self._search_regex( +                                    r'html5player-(.+?)\.js', video_webpage,                                      'html5 player', fatal=False) -                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) +                                player_desc = u'html5 player %s' % player_version + +                            parts_sizes = u'.'.join(compat_str(len(part)) 
for part in encrypted_sig.split('.'))                              self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % -                                (len(s), parts_sizes, url_data['itag'][0], player)) -                        encrypted_sig = url_data['s'][0] -                        if age_gate: -                            signature = self._decrypt_signature_age_gate(encrypted_sig) -                        else: -                            signature = self._decrypt_signature(encrypted_sig) +                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + +                        if not age_gate: +                            jsplayer_url_json = self._search_regex( +                                r'"assets":.+?"js":\s*("[^"]+")', +                                video_webpage, u'JS player URL') +                            player_url = json.loads(jsplayer_url_json) + +                        signature = self._decrypt_signature( +                            encrypted_sig, video_id, player_url, age_gate)                          url += '&signature=' + signature                      if 'ratebypass' not in url:                          url += '&ratebypass=yes' @@ -812,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  return          else: -            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') +            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')          results = []          for format_param, video_real_url in video_url_list: @@ -1007,6 +1660,9 @@ class YoutubeUserIE(InfoExtractor):                  response = json.loads(page)              except ValueError as err:                  raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) +            if 'entry' not in response['feed']: +                # Number of videos is a multiple 
of self._MAX_RESULTS +                break              # Extract video identifiers              ids_in_page = [] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 418509cb9..faed7ff7f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -2,16 +2,14 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      ExtractorError, -    unescapeHTML,  ) +  class ZDFIE(InfoExtractor): -    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' -    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>' +    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'      _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' -    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' -    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -19,6 +17,9 @@ class ZDFIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('video_id') +        if mobj.group('hash'): +            url = url.replace(u'#', u'', 1) +          html = self._download_webpage(url, video_id)          streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]          if streams is None: @@ -27,39 +28,48 @@ class ZDFIE(InfoExtractor):          # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url          # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url          # choose first/default media type and highest quality for now -        for s in streams:        #find 300 - dsl1000mbit -            if s['quality'] == '300' and s['media_type'] == 'wstreaming': -                stream_=s -                break -        for s in streams:        #find veryhigh - 
dsl2000mbit -            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working -                stream_=s -                break -        if stream_ is None: +        def stream_pref(s): +            TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] +            try: +                type_pref = TYPE_ORDER.index(s['media_type']) +            except ValueError: +                type_pref = 999 + +            QUALITY_ORDER = ['veryhigh', '300'] +            try: +                quality_pref = QUALITY_ORDER.index(s['quality']) +            except ValueError: +                quality_pref = 999 + +            return (type_pref, quality_pref) + +        sorted_streams = sorted(streams, key=stream_pref) +        if not sorted_streams:              raise ExtractorError(u'No stream found.') +        stream = sorted_streams[0] -        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') +        media_link = self._download_webpage( +            stream['video_url'], +            video_id, +            u'Get stream URL') -        self.report_extraction(video_id) -        mobj = re.search(self._TITLE, html) -        if mobj is None: -            raise ExtractorError(u'Cannot extract title') -        title = unescapeHTML(mobj.group('title')) +        MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' +        RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' -        mobj = re.search(self._MMS_STREAM, media_link) +        mobj = re.search(self._MEDIA_STREAM, media_link)          if mobj is None: -            mobj = re.search(self._RTSP_STREAM, media_link) +            mobj = re.search(RTSP_STREAM, media_link)              if mobj is None:                  raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') -        mms_url = mobj.group('video_url') +        video_url = mobj.group('video_url') -        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) -        if mobj is None: -    
        raise ExtractorError(u'Cannot extract extention') -        ext = mobj.group('ext') +        title = self._html_search_regex( +            r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', +            html, u'title') -        return [{'id': video_id, -                 'url': mms_url, -                 'title': title, -                 'ext': ext -                 }] +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'ext': determine_ext(video_url) +        } diff --git a/youtube_dl/update.py b/youtube_dl/update.py index ccab6f27f..0689a4891 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -1,6 +1,9 @@ +import io  import json  import traceback  import hashlib +import subprocess +import sys  from zipimport import zipimporter  from .utils import * @@ -34,7 +37,7 @@ def rsa_verify(message, signature, key):      if signature != sha256(message).digest(): return False      return True -def update_self(to_screen, verbose, filename): +def update_self(to_screen, verbose):      """Update the program file with the latest version from the repository"""      UPDATE_URL = "http://rg3.github.io/youtube-dl/update/" @@ -42,7 +45,6 @@ def update_self(to_screen, verbose, filename):      JSON_URL = UPDATE_URL + 'versions.json'      UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) -      if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, "frozen"):          to_screen(u'It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.')          return @@ -75,11 +77,18 @@ def update_self(to_screen, verbose, filename):          to_screen(u'ERROR: the versions file signature is invalid. 
Aborting.')          return -    to_screen(u'Updating to version ' + versions_info['latest'] + '...') -    version = versions_info['versions'][versions_info['latest']] +    version_id = versions_info['latest'] +    to_screen(u'Updating to version ' + version_id + '...') +    version = versions_info['versions'][version_id]      print_notes(to_screen, versions_info['versions']) +    filename = sys.argv[0] +    # Py2EXE: Filename could be different +    if hasattr(sys, "frozen") and not os.path.isfile(filename): +        if os.path.isfile(filename + u'.exe'): +            filename += u'.exe' +      if not os.access(filename, os.W_OK):          to_screen(u'ERROR: no write permissions on %s' % filename)          return @@ -116,16 +125,18 @@ def update_self(to_screen, verbose, filename):          try:              bat = os.path.join(directory, 'youtube-dl-updater.bat') -            b = open(bat, 'w') -            b.write(""" -echo Updating youtube-dl... +            with io.open(bat, 'w') as batfile: +                batfile.write(u""" +@echo off +echo Waiting for file handle to be closed ...  ping 127.0.0.1 -n 5 -w 1000 > NUL -move /Y "%s.new" "%s" -del "%s" -            \n""" %(exe, exe, bat)) -            b.close() +move /Y "%s.new" "%s" > NUL +echo Updated youtube-dl to version %s. 
+start /b "" cmd /c del "%%~f0"&exit /b" +                \n""" % (exe, exe, version_id)) -            os.startfile(bat) +            subprocess.Popen([bat])  # Continues to run in the background +            return  # Do not show premature success messages          except (IOError, OSError) as err:              if verbose: to_screen(compat_str(traceback.format_exc()))              to_screen(u'ERROR: unable to overwrite current version') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 768c6207d..201ed255d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -67,6 +67,12 @@ except ImportError:  # Python 2      from urllib2 import HTTPError as compat_HTTPError  try: +    from urllib.request import urlretrieve as compat_urlretrieve +except ImportError:  # Python 2 +    from urllib import urlretrieve as compat_urlretrieve + + +try:      from subprocess import DEVNULL      compat_subprocess_get_DEVNULL = lambda: DEVNULL  except ImportError: @@ -700,7 +706,16 @@ def unified_strdate(date_str):      date_str = date_str.replace(',',' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) -    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] +    format_expressions = [ +        '%d %B %Y', +        '%B %d %Y', +        '%b %d %Y', +        '%Y-%m-%d', +        '%d/%m/%Y', +        '%Y/%m/%d %H:%M:%S', +        '%d.%m.%Y %H:%M', +        '%Y-%m-%dT%H:%M:%SZ', +    ]      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -781,6 +796,18 @@ def platform_name():      return res +def write_string(s, out=None): +    if out is None: +        out = sys.stderr +    assert type(s) == type(u'') + +    if ('b' in getattr(out, 'mode', '') or +            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr +        s = 
s.encode(preferredencoding(), 'ignore') +    out.write(s) +    out.flush() + +  def bytes_to_intlist(bs):      if not bs:          return [] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3b2505c77..e3e5d5538 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.09.12' +__version__ = '2013.09.29' | 
