diff options
| author | Pierre Rudloff <contact@rudloff.pro> | 2013-08-31 00:37:29 +0200 | 
|---|---|---|
| committer | Pierre Rudloff <contact@rudloff.pro> | 2013-08-31 00:37:29 +0200 | 
| commit | 847f582290c6ad6ec0c72760ea3cfa6417d28e3c (patch) | |
| tree | c12baa61c649ec00e5fdfe35b1f88f0025d54f99 | |
| parent | cd9c100963e8b8bf651d1f359e5f7812603ca0fd (diff) | |
| parent | 10f5c016ec6262e5d29327e97fe4f3d1127ccdff (diff) | |
Merge remote-tracking branch 'upstream/master'
| -rw-r--r-- | devscripts/bash-completion.in | 6 | ||||
| -rw-r--r-- | devscripts/youtube_genalgo.py | 8 | ||||
| -rw-r--r-- | test/test_download.py | 7 | ||||
| -rw-r--r-- | youtube_dl/PostProcessor.py | 4 | ||||
| -rw-r--r-- | youtube_dl/YoutubeDL.py | 18 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 19 | ||||
| -rw-r--r-- | youtube_dl/aes.py | 202 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/addanime.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/appletrailers.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 14 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/ign.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/kankan.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/mit.py | 74 | ||||
| -rw-r--r-- | youtube_dl/extractor/orf.py | 67 | ||||
| -rw-r--r-- | youtube_dl/extractor/sohu.py | 90 | ||||
| -rw-r--r-- | youtube_dl/extractor/trilulilu.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/unistra.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/wat.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/youporn.py | 18 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 8 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 37 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
24 files changed, 557 insertions, 54 deletions
diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index 3b99a9614..bd10f63c2 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -4,8 +4,12 @@ __youtube-dl()      COMPREPLY=()      cur="${COMP_WORDS[COMP_CWORD]}"      opts="{{flags}}" +    keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" -    if [[ ${cur} == * ]] ; then +    if [[ ${cur} =~ : ]]; then +        COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) +        return 0 +    elif [[ ${cur} == * ]] ; then          COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )          return 0      fi diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 917e8f79d..97a0d7290 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -20,15 +20,15 @@ tests = [      # 87      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<",       "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), -    # 86 - vflh9ybst 2013/08/23 +    # 86 - vflHOr_nV 2013/08/30      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", -     "yuioplkjhgfdsazxcvbnm1234567890QWERrYUIOPLKqHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"), +     "?;}|[{=+._)(*&^%$#@!MNBqCXZASDFGHJKLPOIUYTREWQ<987654321mnbvcxzasdfghjklpoiuytrew"),      # 85      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<",       ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), -    # 84 - vflh9ybst 2013/08/23 (sporadic) +    # 84 - vflg0g8PQ 2013/08/29 (sporadic)      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", -     "yuioplkjhgfdsazxcvbnm1234567890QWERrYUIOPLKqHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<"), +     ">?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWq0987654321mnbvcxzasdfghjklpoiuytr"),      # 83      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",       ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), diff --git a/test/test_download.py b/test/test_download.py index 21cb2e694..23a66254d 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -127,12 +127,11 @@ def generator(test_case):                      info_dict = json.load(infof)                  for (info_field, expected) in tc.get('info_dict', {}).items():                      if isinstance(expected, compat_str) and expected.startswith('md5:'): -                        self.assertEqual(expected, 'md5:' + md5(info_dict.get(info_field))) +                        got = 'md5:' + md5(info_dict.get(info_field))                      else:                          got = info_dict.get(info_field) -                        self.assertEqual( -                            expected, got, -                            u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) +                    self.assertEqual(expected, got, +                        u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))                  # If checkable fields are missing from the test case, print the info_dict                  test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index c02ed7148..ae56d2082 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -137,7 +137,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          try:              FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts)          except FFmpegPostProcessorError as err: -            raise AudioConversionError(err.message) +            raise AudioConversionError(err.msg)      def run(self, information):          path = information['filepath'] @@ -207,7 +207,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          except:              etype,e,tb = sys.exc_info()              if isinstance(e, AudioConversionError): -                msg = u'audio conversion failed: ' + e.message +                msg = u'audio conversion failed: ' + e.msg              else:                  msg = u'error running ' + (self._exes['avconv'] and 'avconv' or 'ffmpeg')              raise PostProcessingError(msg) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d5f7c81eb..b289bd9e2 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,6 +97,7 @@ class YoutubeDL(object):      def __init__(self, params):          """Create a FileDownloader object with the given options."""          self._ies = [] +        self._ies_instances = {}          self._pps = []          self._progress_hooks = []          self._download_retcode = 0 @@ -111,8 +112,21 @@ class YoutubeDL(object):      def add_info_extractor(self, ie):          """Add an InfoExtractor object to the end of the list."""          self._ies.append(ie) +        self._ies_instances[ie.ie_key()] = ie          ie.set_downloader(self) +    def get_info_extractor(self, ie_key): +        """ +        Get an instance of an IE with name ie_key, it will try to get one from +        the _ies list, if there's no instance it will create a new one and add +        it to the extractor list. +        """ +        ie = self._ies_instances.get(ie_key) +        if ie is None: +            ie = get_info_extractor(ie_key)() +            self.add_info_extractor(ie) +        return ie +      def add_default_info_extractors(self):          """          Add the InfoExtractors returned by gen_extractors to the end of the list @@ -294,9 +308,7 @@ class YoutubeDL(object):           '''          if ie_key: -            ie = get_info_extractor(ie_key)() -            ie.set_downloader(self) -            ies = [ie] +            ies = [self.get_info_extractor(ie_key)]          else:              ies = self._ies diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bc6a6d180..431460c57 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -45,6 +45,7 @@ import sys  import warnings  import platform +  from .utils import *  from .update import update_self  from .version import __version__ @@ -99,6 +100,16 @@ def parseOpts(overrideArguments=None):              pass          return None +    def _hide_login_info(opts): +        opts = list(opts) +        for private_opt in ['-p', '--password', '-u', '--username']: +            try: +                i = opts.index(private_opt) +                opts[i+1] = '<PRIVATE>' +            except ValueError: +                pass +        return opts +      max_width = 80      max_help_position = 80 @@ -357,9 +368,9 @@ def parseOpts(overrideArguments=None):          argv = systemConf + userConf + commandLineConf          opts, args = parser.parse_args(argv)          if opts.verbose: -            sys.stderr.write(u'[debug] System config: ' + repr(systemConf) + '\n') -            sys.stderr.write(u'[debug] User config: ' + repr(userConf) + '\n') -            sys.stderr.write(u'[debug] Command-line args: ' + repr(commandLineConf) + '\n') +            sys.stderr.write(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') +            sys.stderr.write(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') +            sys.stderr.write(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')      return parser, opts, args @@ -611,7 +622,7 @@ def _real_main(argv=None):                  sys.exc_clear()              except:                  pass -        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n') +        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n')          sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n')      ydl.add_default_info_extractors() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py new file mode 100644 index 000000000..9a0c93fa6 --- /dev/null +++ b/youtube_dl/aes.py @@ -0,0 +1,202 @@ +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text'] + +import base64 +from math import ceil + +from .utils import bytes_to_intlist, intlist_to_bytes + +BLOCK_SIZE_BYTES = 16 + +def aes_ctr_decrypt(data, key, counter): +    """ +    Decrypt with aes in counter mode +     +    @param {int[]} data        cipher +    @param {int[]} key         16/24/32-Byte cipher key +    @param {instance} counter  Instance whose next_value function (@returns {int[]}  16-Byte block) +                               returns the next counter block +    @returns {int[]}           decrypted data +    """ +    expanded_key = key_expansion(key) +    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) +     +    decrypted_data=[] +    for i in range(block_count): +        counter_block = counter.next_value() +        block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES] +        block += [0]*(BLOCK_SIZE_BYTES - len(block)) +         +        cipher_counter_block = aes_encrypt(counter_block, expanded_key) +        decrypted_data += xor(block, cipher_counter_block) +    decrypted_data = decrypted_data[:len(data)] +     +    return decrypted_data + +def key_expansion(data): +    """ +    Generate key schedule +     +    @param {int[]} data  16/24/32-Byte cipher key +    @returns {int[]}     176/208/240-Byte expanded key  +    """ +    data = data[:] # copy +    rcon_iteration = 1 +    key_size_bytes = len(data) +    expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES +     +    while len(data) < expanded_key_size_bytes: +        temp = data[-4:] +        temp = key_schedule_core(temp, rcon_iteration) +        rcon_iteration += 1 +        data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        for _ in range(3): +            temp = data[-4:] +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        if key_size_bytes == 32: +            temp = data[-4:] +            temp = sub_bytes(temp) +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        for _ in range(3 if key_size_bytes == 32  else 2 if key_size_bytes == 24 else 0): +            temp = data[-4:] +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +    data = data[:expanded_key_size_bytes] +     +    return data + +def aes_encrypt(data, expanded_key): +    """ +    Encrypt one block with aes +     +    @param {int[]} data          16-Byte state +    @param {int[]} expanded_key  176/208/240-Byte expanded key  +    @returns {int[]}             16-Byte cipher +    """ +    rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 +     +    data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) +    for i in range(1, rounds+1): +        data = sub_bytes(data) +        data = shift_rows(data) +        if i != rounds: +            data = mix_columns(data) +        data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) +     +    return data + +def aes_decrypt_text(data, password, key_size_bytes): +    """ +    Decrypt text +    - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter +    - The cipher key is retrieved by encrypting the first 16 Byte of 'password' +      with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) +    - Mode of operation is 'counter' +     +    @param {str} data                    Base64 encoded string +    @param {str,unicode} password        Password (will be encoded with utf-8) +    @param {int} key_size_bytes          Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit +    @returns {str}                       Decrypted data +    """ +    NONCE_LENGTH_BYTES = 8 +     +    data = bytes_to_intlist(base64.b64decode(data)) +    password = bytes_to_intlist(password.encode('utf-8')) +     +    key = password[:key_size_bytes] + [0]*(key_size_bytes - len(password)) +    key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) +     +    nonce = data[:NONCE_LENGTH_BYTES] +    cipher = data[NONCE_LENGTH_BYTES:] +     +    class Counter: +        __value = nonce + [0]*(BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) +        def next_value(self): +            temp = self.__value +            self.__value = inc(self.__value) +            return temp +     +    decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) +    plaintext = intlist_to_bytes(decrypted_data) +     +    return plaintext + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, +        0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, +        0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, +        0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, +        0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, +        0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, +        0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, +        0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, +        0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, +        0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, +        0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, +        0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, +        0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, +        0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, +        0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, +        0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +MIX_COLUMN_MATRIX = ((2,3,1,1), +                     (1,2,3,1), +                     (1,1,2,3), +                     (3,1,1,2)) + +def sub_bytes(data): +    return [SBOX[x] for x in data] + +def rotate(data): +    return data[1:] + [data[0]] + +def key_schedule_core(data, rcon_iteration): +    data = rotate(data) +    data = sub_bytes(data) +    data[0] = data[0] ^ RCON[rcon_iteration] +     +    return data + +def xor(data1, data2): +    return [x^y for x, y in zip(data1, data2)] + +def mix_column(data): +    data_mixed = [] +    for row in range(4): +        mixed = 0 +        for column in range(4): +            addend = data[column] +            if MIX_COLUMN_MATRIX[row][column] in (2,3): +                addend <<= 1 +                if addend > 0xff: +                    addend &= 0xff +                    addend ^= 0x1b +                if MIX_COLUMN_MATRIX[row][column] == 3: +                    addend ^= data[column] +            mixed ^= addend & 0xff +        data_mixed.append(mixed) +    return data_mixed + +def mix_columns(data): +    data_mixed = [] +    for i in range(4): +        column = data[i*4 : (i+1)*4] +        data_mixed += mix_column(column) +    return data_mixed + +def shift_rows(data): +    data_shifted = [] +    for column in range(4): +        for row in range(4): +            data_shifted.append( data[((column + row) & 0b11) * 4 + row] ) +    return data_shifted + +def inc(data): +    data = data[:] # copy +    for i in range(len(data)-1,-1,-1): +        if data[i] == 255: +            data[i] = 0 +        else: +            data[i] = data[i] + 1 +            break +    return data diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c76b99a81..90f1a4418 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -50,6 +50,7 @@ from .keek import KeekIE  from .liveleak import LiveLeakIE  from .livestream import LivestreamIE  from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE  from .mixcloud import MixcloudIE  from .mtv import MTVIE  from .muzu import MuzuTVIE @@ -58,6 +59,7 @@ from .myvideo import MyVideoIE  from .nba import NBAIE  from .nbc import NBCNewsIE  from .ooyala import OoyalaIE +from .orf import ORFIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE  from .pornotube import PornotubeIE @@ -69,6 +71,7 @@ from .roxwel import RoxwelIE  from .rtlnow import RTLnowIE  from .sina import SinaIE  from .slashdot import SlashdotIE +from .sohu import SohuIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 46db8262f..82a785a19 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -1,4 +1,3 @@ -import ast  import re  from .common import InfoExtractor diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index b3bdb2955..8b191c196 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -4,7 +4,6 @@ import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import (      determine_ext, -    ExtractorError,  ) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 12169b2bb..77726ee24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -114,6 +114,11 @@ class InfoExtractor(object):          """Real extraction process. Redefine in subclasses."""          pass +    @classmethod +    def ie_key(cls): +        """A string for getting the InfoExtractor with get_info_extractor""" +        return cls.__name__[:-2] +      @property      def IE_NAME(self):          return type(self).__name__[:-2] @@ -140,12 +145,17 @@ class InfoExtractor(object):          urlh = self._request_webpage(url_or_request, video_id, note, errnote)          content_type = urlh.headers.get('Content-Type', '') +        webpage_bytes = urlh.read()          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)          if m:              encoding = m.group(1)          else: -            encoding = 'utf-8' -        webpage_bytes = urlh.read() +            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', +                          webpage_bytes[:1024]) +            if m: +                encoding = m.group(1).decode('ascii') +            else: +                encoding = 'utf-8'          if self._downloader.params.get('dump_intermediate_pages', False):              try:                  url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bfc9bff49..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,8 +7,8 @@ from .common import InfoExtractor  from ..utils import (      compat_urllib_error,      compat_urllib_parse, -    compat_urllib_parse_urlparse,      compat_urllib_request, +    compat_urlparse,      ExtractorError,  ) @@ -163,15 +163,7 @@ class GenericIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          video_url = compat_urllib_parse.unquote(mobj.group(1)) -        if video_url.startswith('//'): -            video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url -        if '://' not in video_url: -            up = compat_urllib_parse_urlparse(url) -            if video_url.startswith('/'): -                video_url = up.scheme + '://' + up.netloc + video_url -            else:  # relative path -                video_url = (up.scheme + '://' + up.netloc + -                             up.path.rpartition('/')[0] + '/' + video_url) +        video_url = compat_urlparse.urljoin(url, video_url)          video_id = os.path.basename(video_url)          # here's a fun little line of code for you: diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 62abab655..b1c84278a 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -13,7 +13,7 @@ class IGNIE(InfoExtractor):      Some videos of it.ign.com are also supported      """ -    _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)' +    _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles)(/.+)?/(?P<name_or_id>.+)'      IE_NAME = u'ign.com'      _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' @@ -41,7 +41,11 @@ class IGNIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          name_or_id = mobj.group('name_or_id') +        page_type = mobj.group('type')          webpage = self._download_webpage(url, name_or_id) +        if page_type == 'articles': +            video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') +            return self.url_result(video_url, ie='IGN')          video_id = self._find_video_id(webpage)          result = self._get_video_info(video_id)          description = self._html_search_regex(self._DESCRIPTION_RE, @@ -68,7 +72,7 @@ class IGNIE(InfoExtractor):  class OneUPIE(IGNIE):      """Extractor for 1up.com, it uses the ign videos system.""" -    _VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P<name_or_id>.+)' +    _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)'      IE_NAME = '1up.com'      _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') -        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') +        title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') +        surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) +        gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) +        gcid = gcids[-1]          video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,                                                   video_id, u'Downloading video url info') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..52be9232f --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,74 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): +    IE_NAME = u'techtv.mit.edu' +    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', +        u'file': u'25418.mp4', +        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', +        u'info_dict': { +            u'title': u'MIT DNA Learning Center Set', +            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        raw_page = self._download_webpage( +            'http://techtv.mit.edu/videos/%s' % video_id, video_id) +        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page) + +        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', +            raw_page, u'base url') +        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, +            u'video formats') +        formats = json.loads(formats_json) +        formats = sorted(formats, key=lambda f: f['bitrate']) + +        title = get_element_by_id('edit-title', clean_page) +        description = clean_html(get_element_by_id('edit-description', clean_page)) +        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', +            raw_page, u'thumbnail', flags=re.DOTALL) + +        return {'id': video_id, +                'title': title, +                'url': base_url + formats[-1]['url'].replace('mp4:', ''), +                'ext': 'mp4', +                'description': description, +                'thumbnail': thumbnail, +                } + + +class MITIE(TechTVMITIE): +    IE_NAME = u'video.mit.edu' +    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + +    _TEST = { +        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', +        u'file': u'21783.mp4', +        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', +        u'info_dict': { +            u'title': u'The Government is Profiling You', +            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        page_title = mobj.group('title') +        webpage = self._download_webpage(url, page_title) +        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) +        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, +            u'embed url') +        return self.url_result(embed_url, ie='TechTVMIT') diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py new file mode 100644 index 000000000..41ef8e992 --- /dev/null +++ b/youtube_dl/extractor/orf.py @@ -0,0 +1,67 @@ +# coding: utf-8 + +import re +import xml.etree.ElementTree +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_urlparse, +    ExtractorError, +    find_xpath_attr, +) + +class ORFIE(InfoExtractor): +    _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter', +        u'file': u'6566957.flv', +        u'info_dict': { +            u'title': u'Wetter', +            u'description': u'Christa Kummer, Marcus Wadsak und Kollegen  präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at', +        }, +        u'params': { +            # It uses rtmp +            u'skip_download': True, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        playlist_id = mobj.group('id') +        webpage = self._download_webpage(url, playlist_id) + +        flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') +        flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] +        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) +        playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') +        playlist = json.loads(playlist_json) + +        videos = [] +        ns = '{http://tempuri.org/XMLSchema.xsd}' +        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} +        webpage_description = self._og_search_description(webpage) +        for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): +            # Get best quality url +            rtmp_url = None +            for q in ['Q6A', 'Q4A', 'Q1A']: +                video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) +                if video_url is not None: +                    rtmp_url = video_url.text +                    break +            if rtmp_url is None: +                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) +            description = self._html_search_regex( +                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, +                u'description', default=webpage_description, flags=re.DOTALL) +            videos.append({ +                '_type': 'video', +                'id': info['id'], +                'title': info['title'], +                'url': rtmp_url, +                'ext': 'flv', +                'description': description, +                }) + +        return videos diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..77bb0a8dc --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +import json +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SohuIE(InfoExtractor): +    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' + +    _TEST = { +        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', +        u'file': u'382479172.mp4', +        u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', +        u'info_dict': { +            u'title': u'MV:Far East Movement《The Illest》', +        }, +    } + +    def _real_extract(self, url): + +        def _fetch_data(vid_id): +            base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' +            data_url = base_data_url + str(vid_id) +            data_json = self._download_webpage( +                data_url, video_id, +                note=u'Downloading JSON data for ' + str(vid_id)) +            return json.loads(data_json) + +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>', +                                            webpage, u'video title') +        title = raw_title.partition('-')[0].strip() + +        vid = self._html_search_regex(r'var vid="(\d+)"', webpage, +                                      u'video path') +        data = _fetch_data(vid) + +        QUALITIES = ('ori', 'super', 'high', 'nor') +        vid_ids = [data['data'][q + 'Vid'] +                   for q in QUALITIES +                   if data['data'][q + 'Vid'] != 0] +        if not vid_ids: +            raise ExtractorError(u'No formats available for this video') + +        # For now, we just pick the highest available quality +        vid_id = vid_ids[-1] + +        format_data = data if vid == vid_id else _fetch_data(vid_id) +        part_count = format_data['data']['totalBlocks'] +        allot = format_data['allot'] +        prot = format_data['prot'] +        clipsURL = format_data['data']['clipsURL'] +        su = format_data['data']['su'] + +        playlist = [] +        for i in range(part_count): +            part_url = ('http://%s/?prot=%s&file=%s&new=%s' % +                        (allot, prot, clipsURL[i], su[i])) +            part_str = self._download_webpage( +                part_url, video_id, +                note=u'Downloading part %d of %d' % (i+1, part_count)) + +            part_info = part_str.split('|') +            video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + +            video_info = { +                'id': '%s_part%02d' % (video_id, i + 1), +                'title': title, +                'url': video_url, +                'ext': 'mp4', +            } +            playlist.append(video_info) + +        if len(playlist) == 1: +            info = playlist[0] +            info['id'] = video_id +        else: +            info = { +                '_type': 'playlist', +                'entries': playlist, +                'id': video_id, +            } + +        return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 1c46156c7..f278951ba 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -3,9 +3,6 @@ import re  import xml.etree.ElementTree  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -)  class TriluliluIE(InfoExtractor): diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 5ba0a9061..516e18914 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -11,7 +11,7 @@ class UnistraIE(InfoExtractor):          u'md5': u'736f605cfdc96724d55bb543ab3ced24',          u'info_dict': {              u'title': u'M!ss Yella', -            u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc', +            u'description': u'md5:104892c71bd48e55d70b902736b81bbf',          },      } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re  from .common import InfoExtractor  from ..utils import ( -    compat_urllib_parse,      unified_strdate,  ) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import (      unescapeHTML,      unified_strdate,  ) - +from ..aes import ( +    aes_decrypt_text +)  class YouPornIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'      _TEST = {          u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',          u'file': u'505835.mp4', -        u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', +        u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',          u'info_dict': {              u"upload_date": u"20101221",               u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",  @@ -75,7 +77,15 @@ class YouPornIE(InfoExtractor):          # Get all of the links from the page          LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'          links = re.findall(LINK_RE, download_list_html) -        if(len(links) == 0): +         +        # Get link of hd video if available +        mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage) +        if mobj != None: +            encrypted_video_url = mobj.group(u'encrypted_video_url') +            video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') +            links = [video_url] + links +         +        if not links:              raise ExtractorError(u'ERROR: no known formats available for video')          self.to_screen(u'Links found: %d' % len(links)) @@ -112,7 +122,7 @@ class YouPornIE(InfoExtractor):              self._print_formats(formats)              return -        req_format = self._downloader.params.get('format', None) +        req_format = self._downloader.params.get('format', 'best')          self.to_screen(u'Format: %s' % req_format)          if req_format is None or req_format == 'best': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e486afd0..810ce6f5d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -335,7 +335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              u"info_dict": {                  u"upload_date": u"20120506",                  u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", -                u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", +                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",                  u"uploader": u"Icona Pop",                  u"uploader_id": u"IconaPop"              } @@ -423,11 +423,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          elif len(s) == 87:              return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]          elif len(s) == 86: -            return s[5:40] + s[3] + s[41:48] + s[0] + s[49:86] +            return s[81:73:-1] + s[84] + s[72:58:-1] + s[0] + s[57:35:-1] + s[85] + s[34:0:-1]          elif len(s) == 85:              return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]          elif len(s) == 84: -            return s[5:40] + s[3] + s[41:48] + s[0] + s[49:84] +            return s[81:36:-1] + s[0] + s[35:2:-1]          elif len(s) == 83:              return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]          elif len(s) == 82: @@ -1161,7 +1161,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):      IE_NAME = u'youtube:favorites'      IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' +    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'      _LOGIN_REQUIRED = True      def _real_extract(self, url): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index be788cf5a..201802cee 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,19 +1,20 @@  #!/usr/bin/env python  # -*- coding: utf-8 -*- +import datetime +import email.utils  import errno  import gzip  import io  import json  import locale  import os +import platform  import re +import socket  import sys  import traceback  import zlib -import email.utils -import socket -import datetime  try:      import urllib.request as compat_urllib_request @@ -212,7 +213,7 @@ if sys.version_info >= (2,7):      def find_xpath_attr(node, xpath, key, val):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z]+$', key) -        assert re.match(r'^[a-zA-Z@\s]*$', val) +        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)          expr = xpath + u"[@%s='%s']" % (key, val)          return node.find(expr)  else: @@ -732,3 +733,31 @@ class DateRange(object):          return self.start <= date <= self.end      def __str__(self):          return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): +    """ Returns the platform name as a compat_str """ +    res = platform.platform() +    if isinstance(res, bytes): +        res = res.decode(preferredencoding()) + +    assert isinstance(res, compat_str) +    return res + + +def bytes_to_intlist(bs): +    if not bs: +        return [] +    if isinstance(bs[0], int):  # Python 3 +        return list(bs) +    else: +        return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): +    if not xs: +        return b'' +    if isinstance(chr(0), bytes):  # Python 2 +        return ''.join([chr(x) for x in xs]) +    else: +        return bytes(xs) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0b56e48dc..b6284c6d6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.28' +__version__ = '2013.08.30'  | 
