diff options
| author | Allan Zhou <allanzp@gmail.com> | 2013-08-28 09:57:28 -0700 | 
|---|---|---|
| committer | Allan Zhou <allanzp@gmail.com> | 2013-08-28 09:57:28 -0700 | 
| commit | 591078babff1d783bed872c5b441dc570d354448 (patch) | |
| tree | 6cf813c993ea26eaec1d976b616d973c2bc47583 | |
| parent | 99859d436cdee9acc9c869254e734eba5b748260 (diff) | |
| parent | 9868c781a1bb3f50385bc7d1e87d82080ffffbc6 (diff) | |
Merge remote-tracking branch 'upstream/master'
26 files changed, 989 insertions, 78 deletions
diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 6f1d6ef99..917e8f79d 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -14,9 +14,9 @@ tests = [      # 89       ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'",       "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), -    # 88 +    # 88 - vflapUV9V 2013/08/28      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", -     "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), +     "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"),      # 87      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<",       "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 217c4a52f..7c5ac4bc2 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -64,6 +64,17 @@ class FileDownloader(object):          return '%.2f%s' % (converted, suffix)      @staticmethod +    def format_seconds(seconds): +        (mins, secs) = divmod(seconds, 60) +        (hours, eta_mins) = divmod(mins, 60) +        if hours > 99: +            return '--:--:--' +        if hours == 0: +            return '%02d:%02d' % (mins, secs) +        else: +            return '%02d:%02d:%02d' % (hours, mins, secs) + +    @staticmethod      def calc_percent(byte_counter, data_len):          if data_len is None:              return '---.-%' @@ -78,14 +89,7 @@ class FileDownloader(object):              return '--:--'          rate = float(current) / dif          eta = int((float(total) - float(current)) / rate) -        (eta_mins, eta_secs) = divmod(eta, 60) -        (eta_hours, eta_mins) = divmod(eta_mins, 60) -        if eta_hours > 99: -            return '--:--:--' -        if eta_hours == 0: -            return '%02d:%02d' % (eta_mins, eta_secs) -        else: -            return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) +        return FileDownloader.format_seconds(eta)      @staticmethod      def calc_speed(start, now, bytes): @@ -234,12 +238,14 @@ class FileDownloader(object):          """Report it was impossible to resume download."""          self.to_screen(u'[download] Unable to resume') -    def report_finish(self): +    def report_finish(self, data_len_str, tot_time):          """Report download finished."""          if self.params.get('noprogress', False):              self.to_screen(u'[download] Download completed')          else: -            self.to_screen(u'') +            clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') +            self.to_screen(u'\r%s[download] 100%% of %s in %s' % +                (clear_line, data_len_str, self.format_seconds(tot_time)))      def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url):          self.report_destination(filename) @@ -542,7 +548,7 @@ class FileDownloader(object):              self.report_error(u'Did not get any data blocks')              return False          stream.close() -        self.report_finish() +        self.report_finish(data_len_str, (time.time() - start))          if data_len is not None and byte_counter != data_len:              raise ContentTooShortError(byte_counter, int(data_len))          self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index c02ed7148..ae56d2082 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -137,7 +137,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          try:              FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts)          except FFmpegPostProcessorError as err: -            raise AudioConversionError(err.message) +            raise AudioConversionError(err.msg)      def run(self, information):          path = information['filepath'] @@ -207,7 +207,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          except:              etype,e,tb = sys.exc_info()              if isinstance(e, AudioConversionError): -                msg = u'audio conversion failed: ' + e.message +                msg = u'audio conversion failed: ' + e.msg              else:                  msg = u'error running ' + (self._exes['avconv'] and 'avconv' or 'ffmpeg')              raise PostProcessingError(msg) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3fc4ec378..b289bd9e2 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,6 +97,7 @@ class YoutubeDL(object):      def __init__(self, params):          """Create a FileDownloader object with the given options."""          self._ies = [] +        self._ies_instances = {}          self._pps = []          self._progress_hooks = []          self._download_retcode = 0 @@ -111,8 +112,21 @@ class YoutubeDL(object):      def add_info_extractor(self, ie):          """Add an InfoExtractor object to the end of the list."""          self._ies.append(ie) +        self._ies_instances[ie.ie_key()] = ie          ie.set_downloader(self) +    def get_info_extractor(self, ie_key): +        """ +        Get an instance of an IE with name ie_key, it will try to get one from +        the _ies list, if there's no instance it will create a new one and add +        it to the extractor list. +        """ +        ie = self._ies_instances.get(ie_key) +        if ie is None: +            ie = get_info_extractor(ie_key)() +            self.add_info_extractor(ie) +        return ie +      def add_default_info_extractors(self):          """          Add the InfoExtractors returned by gen_extractors to the end of the list @@ -294,9 +308,7 @@ class YoutubeDL(object):           '''          if ie_key: -            ie = get_info_extractor(ie_key)() -            ie.set_downloader(self) -            ies = [ie] +            ies = [self.get_info_extractor(ie_key)]          else:              ies = self._ies @@ -448,7 +460,8 @@ class YoutubeDL(object):          if self.params.get('forceid', False):              compat_print(info_dict['id'])          if self.params.get('forceurl', False): -            compat_print(info_dict['url']) +            # For RTMP URLs, also include the playpath +            compat_print(info_dict['url'] + info_dict.get('play_path', u''))          if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:              compat_print(info_dict['thumbnail'])          if self.params.get('forcedescription', False) and 'description' in info_dict: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 614429073..431460c57 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -45,6 +45,7 @@ import sys  import warnings  import platform +  from .utils import *  from .update import update_self  from .version import __version__ @@ -99,6 +100,16 @@ def parseOpts(overrideArguments=None):              pass          return None +    def _hide_login_info(opts): +        opts = list(opts) +        for private_opt in ['-p', '--password', '-u', '--username']: +            try: +                i = opts.index(private_opt) +                opts[i+1] = '<PRIVATE>' +            except ValueError: +                pass +        return opts +      max_width = 80      max_help_position = 80 @@ -357,9 +368,9 @@ def parseOpts(overrideArguments=None):          argv = systemConf + userConf + commandLineConf          opts, args = parser.parse_args(argv)          if opts.verbose: -            sys.stderr.write(u'[debug] System config: ' + repr(systemConf) + '\n') -            sys.stderr.write(u'[debug] User config: ' + repr(userConf) + '\n') -            sys.stderr.write(u'[debug] Command-line args: ' + repr(commandLineConf) + '\n') +            sys.stderr.write(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') +            sys.stderr.write(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') +            sys.stderr.write(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')      return parser, opts, args @@ -430,6 +441,10 @@ def _real_main(argv=None):      proxy_handler = compat_urllib_request.ProxyHandler(proxies)      https_handler = make_HTTPS_handler(opts)      opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) +    # Delete the default user-agent header, which would otherwise apply in +    # cases where our custom HTTP handler doesn't come into play +    # (See https://github.com/rg3/youtube-dl/issues/1309 for details) +    opener.addheaders =[]      compat_urllib_request.install_opener(opener)      socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) @@ -607,7 +622,7 @@ def _real_main(argv=None):                  sys.exc_clear()              except:                  pass -        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n') +        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n')          sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n')      ydl.add_default_info_extractors() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py new file mode 100644 index 000000000..9a0c93fa6 --- /dev/null +++ b/youtube_dl/aes.py @@ -0,0 +1,202 @@ +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text'] + +import base64 +from math import ceil + +from .utils import bytes_to_intlist, intlist_to_bytes + +BLOCK_SIZE_BYTES = 16 + +def aes_ctr_decrypt(data, key, counter): +    """ +    Decrypt with aes in counter mode +     +    @param {int[]} data        cipher +    @param {int[]} key         16/24/32-Byte cipher key +    @param {instance} counter  Instance whose next_value function (@returns {int[]}  16-Byte block) +                               returns the next counter block +    @returns {int[]}           decrypted data +    """ +    expanded_key = key_expansion(key) +    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) +     +    decrypted_data=[] +    for i in range(block_count): +        counter_block = counter.next_value() +        block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES] +        block += [0]*(BLOCK_SIZE_BYTES - len(block)) +         +        cipher_counter_block = aes_encrypt(counter_block, expanded_key) +        decrypted_data += xor(block, cipher_counter_block) +    decrypted_data = decrypted_data[:len(data)] +     +    return decrypted_data + +def key_expansion(data): +    """ +    Generate key schedule +     +    @param {int[]} data  16/24/32-Byte cipher key +    @returns {int[]}     176/208/240-Byte expanded key  +    """ +    data = data[:] # copy +    rcon_iteration = 1 +    key_size_bytes = len(data) +    expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES +     +    while len(data) < expanded_key_size_bytes: +        temp = data[-4:] +        temp = key_schedule_core(temp, rcon_iteration) +        rcon_iteration += 1 +        data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        for _ in range(3): +            temp = data[-4:] +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        if key_size_bytes == 32: +            temp = data[-4:] +            temp = sub_bytes(temp) +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +         +        for _ in range(3 if key_size_bytes == 32  else 2 if key_size_bytes == 24 else 0): +            temp = data[-4:] +            data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) +    data = data[:expanded_key_size_bytes] +     +    return data + +def aes_encrypt(data, expanded_key): +    """ +    Encrypt one block with aes +     +    @param {int[]} data          16-Byte state +    @param {int[]} expanded_key  176/208/240-Byte expanded key  +    @returns {int[]}             16-Byte cipher +    """ +    rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 +     +    data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) +    for i in range(1, rounds+1): +        data = sub_bytes(data) +        data = shift_rows(data) +        if i != rounds: +            data = mix_columns(data) +        data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) +     +    return data + +def aes_decrypt_text(data, password, key_size_bytes): +    """ +    Decrypt text +    - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter +    - The cipher key is retrieved by encrypting the first 16 Byte of 'password' +      with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) +    - Mode of operation is 'counter' +     +    @param {str} data                    Base64 encoded string +    @param {str,unicode} password        Password (will be encoded with utf-8) +    @param {int} key_size_bytes          Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit +    @returns {str}                       Decrypted data +    """ +    NONCE_LENGTH_BYTES = 8 +     +    data = bytes_to_intlist(base64.b64decode(data)) +    password = bytes_to_intlist(password.encode('utf-8')) +     +    key = password[:key_size_bytes] + [0]*(key_size_bytes - len(password)) +    key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) +     +    nonce = data[:NONCE_LENGTH_BYTES] +    cipher = data[NONCE_LENGTH_BYTES:] +     +    class Counter: +        __value = nonce + [0]*(BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) +        def next_value(self): +            temp = self.__value +            self.__value = inc(self.__value) +            return temp +     +    decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) +    plaintext = intlist_to_bytes(decrypted_data) +     +    return plaintext + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, +        0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, +        0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, +        0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, +        0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, +        0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, +        0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, +        0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, +        0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, +        0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, +        0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, +        0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, +        0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, +        0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, +        0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, +        0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +MIX_COLUMN_MATRIX = ((2,3,1,1), +                     (1,2,3,1), +                     (1,1,2,3), +                     (3,1,1,2)) + +def sub_bytes(data): +    return [SBOX[x] for x in data] + +def rotate(data): +    return data[1:] + [data[0]] + +def key_schedule_core(data, rcon_iteration): +    data = rotate(data) +    data = sub_bytes(data) +    data[0] = data[0] ^ RCON[rcon_iteration] +     +    return data + +def xor(data1, data2): +    return [x^y for x, y in zip(data1, data2)] + +def mix_column(data): +    data_mixed = [] +    for row in range(4): +        mixed = 0 +        for column in range(4): +            addend = data[column] +            if MIX_COLUMN_MATRIX[row][column] in (2,3): +                addend <<= 1 +                if addend > 0xff: +                    addend &= 0xff +                    addend ^= 0x1b +                if MIX_COLUMN_MATRIX[row][column] == 3: +                    addend ^= data[column] +            mixed ^= addend & 0xff +        data_mixed.append(mixed) +    return data_mixed + +def mix_columns(data): +    data_mixed = [] +    for i in range(4): +        column = data[i*4 : (i+1)*4] +        data_mixed += mix_column(column) +    return data_mixed + +def shift_rows(data): +    data_shifted = [] +    for column in range(4): +        for row in range(4): +            data_shifted.append( data[((column + row) & 0b11) * 4 + row] ) +    return data_shifted + +def inc(data): +    data = data[:] # copy +    for i in range(len(data)-1,-1,-1): +        if data[i] == 255: +            data[i] = 0 +        else: +            data[i] = data[i] + 1 +            break +    return data diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f71ae2713..6b5037c8c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ +from .appletrailers import AppleTrailersIE +from .addanime import AddAnimeIE  from .archiveorg import ArchiveOrgIE  from .ard import ARDIE  from .arte import ArteTvIE @@ -6,7 +8,10 @@ from .bandcamp import BandcampIE  from .bliptv import BlipTVIE, BlipTVUserIE  from .breakcom import BreakIE  from .brightcove import BrightcoveIE +from .c56 import C56IE  from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .cnn import CNNIE  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE  from .condenast import CondeNastIE @@ -45,12 +50,14 @@ from .keek import KeekIE  from .liveleak import LiveLeakIE  from .livestream import LivestreamIE  from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE  from .mixcloud import MixcloudIE  from .mtv import MTVIE  from .muzu import MuzuTVIE  from .myspass import MySpassIE  from .myvideo import MyVideoIE  from .nba import NBAIE +from .nbc import NBCNewsIE  from .ooyala import OoyalaIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE @@ -63,6 +70,7 @@ from .roxwel import RoxwelIE  from .rtlnow import RTLnowIE  from .sina import SinaIE  from .slashdot import SlashdotIE +from .sohu import SohuIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE @@ -73,18 +81,18 @@ from .ted import TEDIE  from .tf1 import TF1IE  from .thisav import ThisAVIE  from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE  from .tudou import TudouIE  from .tumblr import TumblrIE  from .tutv import TutvIE -from .ustream import UstreamIE  from .unistra import UnistraIE +from .ustream import UstreamIE  from .vbox7 import Vbox7IE  from .veoh import VeohIE  from .vevo import VevoIE  from .videofyme import VideofyMeIE  from .vimeo import VimeoIE, VimeoChannelIE  from .vine import VineIE -from .c56 import C56IE  from .wat import WatIE  from .weibo import WeiboIE  from .wimp import WimpIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..82a785a19 --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,75 @@ +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_HTTPError, +    compat_str, +    compat_urllib_parse, +    compat_urllib_parse_urlparse, + +    ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + +    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' +    IE_NAME = u'AddAnime' +    _TEST = { +        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', +        u'file': u'24MR3YO5SAS9.flv', +        u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', +        u'info_dict': { +            u"description": u"One Piece 606", +            u"title": u"One Piece 606" +        } +    } + +    def _real_extract(self, url): +        try: +            mobj = re.match(self._VALID_URL, url) +            video_id = mobj.group('video_id') +            webpage = self._download_webpage(url, video_id) +        except ExtractorError as ee: +            if not isinstance(ee.cause, compat_HTTPError): +                raise + +            redir_webpage = ee.cause.read().decode('utf-8') +            action = self._search_regex( +                r'<form id="challenge-form" action="([^"]+)"', +                redir_webpage, u'Redirect form') +            vc = self._search_regex( +                r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', +                redir_webpage, u'redirect vc value') +            av = re.search( +                r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', +                redir_webpage) +            if av is None: +                raise ExtractorError(u'Cannot find redirect math task') +            av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + +            parsed_url = compat_urllib_parse_urlparse(url) +            av_val = av_res + len(parsed_url.netloc) +            confirm_url = ( +                parsed_url.scheme + u'://' + parsed_url.netloc + +                action + '?' + +                compat_urllib_parse.urlencode({ +                    'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) +            self._download_webpage( +                confirm_url, video_id, +                note=u'Confirming after redirect') +            webpage = self._download_webpage(url, video_id) + +        video_url = self._search_regex(r"var normal_video_file = '(.*?)';", +                                       webpage, u'video file URL') +        video_title = self._og_search_title(webpage) +        video_description = self._og_search_description(webpage) + +        return { +            '_type': 'video', +            'id':  video_id, +            'url': video_url, +            'ext': 'flv', +            'title': video_title, +            'description': video_description +        } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..8b191c196 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,166 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +) + + +class AppleTrailersIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' +    _TEST = { +        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", +        u"playlist": [ +            { +                u"file": u"manofsteel-trailer4.mov", +                u"md5": u"11874af099d480cc09e103b189805d5f", +                u"info_dict": { +                    u"duration": 111, +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", +                    u"title": u"Trailer 4", +                    u"upload_date": u"20130523", +                    u"uploader_id": u"wb", +                }, +            }, +            { +                u"file": u"manofsteel-trailer3.mov", +                u"md5": u"07a0a262aae5afe68120eed61137ab34", +                u"info_dict": { +                    u"duration": 182, +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", +                    u"title": u"Trailer 3", +                    u"upload_date": u"20130417", +                    u"uploader_id": u"wb", +                }, +            }, +            { +                u"file": u"manofsteel-trailer.mov", +                u"md5": u"e401fde0813008e3307e54b6f384cff1", +                u"info_dict": { +                    u"duration": 148, +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", +                    u"title": u"Trailer", +                    u"upload_date": u"20121212", +                    u"uploader_id": u"wb", +                }, +            }, +            { +                u"file": u"manofsteel-teaser.mov", +                u"md5": u"76b392f2ae9e7c98b22913c10a639c97", +                u"info_dict": { +                    u"duration": 93, +                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", +                    u"title": u"Teaser", +                    u"upload_date": u"20120721", +                    u"uploader_id": u"wb", +                }, +            } +        ] +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        movie = mobj.group('movie') +        uploader_id = mobj.group('company') + +        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' +        playlist_snippet = self._download_webpage(playlist_url, movie) +        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) +        playlist_html = u'<html>' + playlist_cleaned + u'</html>' + +        size_cache = {} + +        doc = xml.etree.ElementTree.fromstring(playlist_html) +        playlist = [] +        for li in doc.findall('./div/ul/li'): +            title = li.find('.//h3').text +            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() +            thumbnail = li.find('.//img').attrib['src'] + +            date_el = li.find('.//p') +            upload_date = None +            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) +            if m: +                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') +            runtime_el = date_el.find('./br') +            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) +            duration = None +            if m: +                duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + +            formats = [] +            for formats_el in li.findall('.//a'): +                if formats_el.attrib['class'] != 'OverlayPanel': +                    continue +                target = formats_el.attrib['target'] + +                format_code = formats_el.text +                if 'Automatic' in format_code: +                    continue + +                size_q = formats_el.attrib['href'] +                size_id = size_q.rpartition('#videos-')[2] +                if size_id not in size_cache: +                    size_url = url + size_q +                    sizepage_html = self._download_webpage( +                        size_url, movie, +                        note=u'Downloading size info %s' % size_id, +                        errnote=u'Error while downloading size info %s' % size_id, +                    ) +                    _doc = xml.etree.ElementTree.fromstring(sizepage_html) +                    size_cache[size_id] = _doc + +                sizepage_doc = size_cache[size_id] +                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') +                for vid_a in links: +                    href = vid_a.get('href') +                    if not href.endswith(target): +                        continue +                    detail_q = href.partition('#')[0] +                    detail_url = url + '/' + detail_q + +                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) +                    detail_id = m.group('detail_id') + +                    detail_html = self._download_webpage( +                        detail_url, movie, +                        note=u'Downloading detail %s %s' % (detail_id, size_id), +                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) +                    ) +                    detail_doc = xml.etree.ElementTree.fromstring(detail_html) +                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') +                    assert movie_link_el.get('class') == 'movieLink' +                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') +                    ext = determine_ext(movie_link) +                    assert ext == 'mov' + +                    formats.append({ +                        'format': format_code, +                        'ext': ext, +                        'url': movie_link, +                    }) + +            info = { +                '_type': 'video', +                'id': video_id, +                'title': title, +                'formats': formats, +                'title': title, +                'duration': duration, +                'thumbnail': thumbnail, +                'upload_date': upload_date, +                'uploader_id': uploader_id, +                'user_agent': 'QuickTime compatible (youtube-dl)', +            } +            # TODO: Remove when #980 has been merged +            info['url'] = formats[-1]['url'] +            info['ext'] = formats[-1]['ext'] + +            playlist.append(info) + +        return { +            '_type': 'playlist', +            'id': movie, +            'entries': playlist, +        } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..50832217a --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + + +class Canalc2IE(InfoExtractor): +    _IE_NAME = 'canalc2.tv' +    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + +    _TEST = { +        u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', +        u'file': u'12163.mp4', +        u'md5': u'060158428b650f896c542dfbb3d6487f', +        u'info_dict': { +            u'title': u'Terrasses du Numérique' +        } +    } + +    def _real_extract(self, url): +        video_id = re.match(self._VALID_URL, url).group(1) +        webpage = self._download_webpage(url, video_id) +        file_name = self._search_regex( +            r"so\.addVariable\('file','(.*?)'\);", +            webpage, 'file name') +        video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + +        title = self._html_search_regex( +            r'class="evenement8">(.*?)</a>', webpage, u'title') +         +        return {'id': video_id, +                'ext': 'mp4', +                'url': video_url, +                'title': title, +                } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor  from ..utils import unified_strdate  class CanalplusIE(InfoExtractor): -    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' +    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'      _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'      IE_NAME = u'canalplus.fr' diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..a79f881cd --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,58 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class CNNIE(InfoExtractor): +    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ +        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' + +    _TESTS = [{ +        u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', +        u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', +        u'md5': u'3e6121ea48df7e2259fe73a0628605c4', +        u'info_dict': { +            u'title': u'Nadal wins 8th French Open title', +            u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', +        }, +    }, +    { +        u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", +        u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", +        u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", +        u"info_dict": { +            u"title": "Student's epic speech stuns new freshmen", +            u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        path = mobj.group('path') +        page_title = mobj.group('title') +        info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path +        info_xml = self._download_webpage(info_url, page_title) +        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + +        formats = [] +        for f in info.findall('files/file'): +            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) +            if mf is not None: +                formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) +        formats = sorted(formats) +        (_,_,_, video_path) = formats[-1] +        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + +        thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) +        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + +        return {'id': info.attrib['id'], +                'title': info.find('headline').text, +                'url': video_url, +                'ext': determine_ext(video_url), +                'thumbnail': thumbnails[-1][1], +                'thumbnails': thumbs_dict, +                'description': info.find('description').text, +                } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52c4483c9..a2986cebe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -114,6 +114,11 @@ class InfoExtractor(object):          """Real extraction process. Redefine in subclasses."""          pass +    @classmethod +    def ie_key(cls): +        """A string for getting the InfoExtractor with get_info_extractor""" +        return cls.__name__[:-2] +      @property      def IE_NAME(self):          return type(self).__name__[:-2] @@ -129,7 +134,7 @@ class InfoExtractor(object):          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:              if errnote is None:                  errnote = u'Unable to download webpage' -            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) +            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):          """ Returns a tuple (page content as string, URL handle) """ @@ -140,12 +145,17 @@ class InfoExtractor(object):          urlh = self._request_webpage(url_or_request, video_id, note, errnote)          content_type = urlh.headers.get('Content-Type', '') +        webpage_bytes = urlh.read()          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)          if m:              encoding = m.group(1)          else: -            encoding = 'utf-8' -        webpage_bytes = urlh.read() +            m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]', +                          webpage_bytes[:1024]) +            if m: +                encoding = m.group(1).decode('ascii') +            else: +                encoding = 'utf-8'          if self._downloader.params.get('dump_intermediate_pages', False):              try:                  url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d034a11bb..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,8 +7,8 @@ from .common import InfoExtractor  from ..utils import (      compat_urllib_error,      compat_urllib_parse, -    compat_urllib_parse_urlparse,      compat_urllib_request, +    compat_urlparse,      ExtractorError,  ) @@ -163,10 +163,7 @@ class GenericIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          video_url = compat_urllib_parse.unquote(mobj.group(1)) -        if video_url.startswith('//'): -            video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url -        if '://' not in video_url: -            video_url = url + ('' if url.endswith('/') else '/') + video_url +        video_url = compat_urlparse.urljoin(url, video_url)          video_id = os.path.basename(video_url)          # here's a fun little line of code for you: diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor):              webpage, 'title', default=u'NA')          # Step 2, Simulate clicking the image box to launch video -        DOMAIN = 'https://plus.google.com' -        video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), +        DOMAIN = 'https://plus.google.com/' +        video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),              webpage, u'video page URL')          if not video_page.startswith(DOMAIN):              video_page = DOMAIN + video_page diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..5bdd08afa 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,6 +1,7 @@  # -*- coding: utf-8 -*-  import re +import json  from .common import InfoExtractor  from ..utils import determine_ext @@ -12,24 +13,25 @@ class HarkIE(InfoExtractor):          u'file': u'mmbzyhkgny.mp3',          u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',          u'info_dict': { -            u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", +            u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", +            u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', +            u'duration': 11,          }      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group(1) -        embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) -        webpage = self._download_webpage(embed_url, video_id) - -        final_url = self._search_regex(r'src="(.+?).mp3"', -                                webpage, 'video url')+'.mp3' -        title = self._html_search_regex(r'<title>(.+?)</title>', -                                webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( -                                'Sound Clip , Quote, MP3, and Ringtone - Hark','') +        json_url = "http://www.hark.com/clips/%s.json" %(video_id) +        info_json = self._download_webpage(json_url, video_id) +        info = json.loads(info_json) +        final_url = info['url']          return {'id': video_id,                  'url' : final_url, -                'title': title, +                'title': info['name'],                  'ext': determine_ext(final_url), +                'description': info['description'], +                'thumbnail': info['image_original'], +                'duration': info['duration'],                  } diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') -        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') +        title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') +        surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) +        gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) +        gcid = gcids[-1]          video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,                                                   video_id, u'Downloading video url info') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): +    IE_NAME = u'techtv.mit.edu' +    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', +        u'file': u'25418.mp4', +        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', +        u'info_dict': { +            u'title': u'MIT DNA Learning Center Set', +            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage( +            'http://techtv.mit.edu/videos/%s' % video_id, video_id) +        embed_page = self._download_webpage( +            'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, +            note=u'Downloading embed page') + +        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', +            embed_page, u'base url') +        formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, +            u'video formats') +        formats = json.loads(formats_json) +        formats = sorted(formats, key=lambda f: f['bitrate']) + +        title = get_element_by_id('edit-title', webpage) +        description = clean_html(get_element_by_id('edit-description', webpage)) +        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', +            embed_page, u'thumbnail', flags=re.DOTALL) + +        return {'id': video_id, +                'title': title, +                'url': base_url + formats[-1]['url'].replace('mp4:', ''), +                'ext': 'mp4', +                'description': description, +                'thumbnail': thumbnail, +                } + + +class MITIE(TechTVMITIE): +    IE_NAME = u'video.mit.edu' +    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + +    _TEST = { +        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', +        u'file': u'21783.mp4', +        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', +        u'info_dict': { +            u'title': u'The Government is Profiling You', +            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        page_title = mobj.group('title') +        webpage = self._download_webpage(url, page_title) +        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) +        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, +            u'embed url') +        return self.url_result(embed_url, ie='TechTVMIT') diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): +    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', +        u'file': u'52753292.flv', +        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', +        u'info_dict': { +            u'title': u'Crew emerges after four-month Mars food study', +            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) +        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + +        return {'id': video_id, +                'title': info.find('headline').text, +                'ext': 'flv', +                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, +                'description': compat_str(info.find('caption').text), +                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, +                } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..77bb0a8dc --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +import json +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SohuIE(InfoExtractor): +    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' + +    _TEST = { +        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', +        u'file': u'382479172.mp4', +        u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', +        u'info_dict': { +            u'title': u'MV:Far East Movement《The Illest》', +        }, +    } + +    def _real_extract(self, url): + +        def _fetch_data(vid_id): +            base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' +            data_url = base_data_url + str(vid_id) +            data_json = self._download_webpage( +                data_url, video_id, +                note=u'Downloading JSON data for ' + str(vid_id)) +            return json.loads(data_json) + +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>', +                                            webpage, u'video title') +        title = raw_title.partition('-')[0].strip() + +        vid = self._html_search_regex(r'var vid="(\d+)"', webpage, +                                      u'video path') +        data = _fetch_data(vid) + +        QUALITIES = ('ori', 'super', 'high', 'nor') +        vid_ids = [data['data'][q + 'Vid'] +                   for q in QUALITIES +                   if data['data'][q + 'Vid'] != 0] +        if not vid_ids: +            raise ExtractorError(u'No formats available for this video') + +        # For now, we just pick the highest available quality +        vid_id = vid_ids[-1] + +        format_data = data if vid == vid_id else _fetch_data(vid_id) +        part_count = format_data['data']['totalBlocks'] +        allot = format_data['allot'] +        prot = format_data['prot'] +        clipsURL = format_data['data']['clipsURL'] +        su = format_data['data']['su'] + +        playlist = [] +        for i in range(part_count): +            part_url = ('http://%s/?prot=%s&file=%s&new=%s' % +                        (allot, prot, clipsURL[i], su[i])) +            part_str = self._download_webpage( +                part_url, video_id, +                note=u'Downloading part %d of %d' % (i+1, part_count)) + +            part_info = part_str.split('|') +            video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + +            video_info = { +                'id': '%s_part%02d' % (video_id, i + 1), +                'title': title, +                'url': video_url, +                'ext': 'mp4', +            } +            playlist.append(video_info) + +        if len(playlist) == 1: +            info = playlist[0] +            info['id'] = video_id +        else: +            info = { +                '_type': 'playlist', +                'entries': playlist, +                'id': video_id, +            } + +        return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..f278951ba --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,73 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class TriluliluIE(InfoExtractor): +    _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)' +    _TEST = { +        u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", +        u'file': u"big-buck-bunny-1.mp4", +        u'info_dict': { +            u"title": u"Big Buck Bunny", +            u"description": u":) pentru copilul din noi", +        }, +        # Server ignores Range headers (--test) +        u"params": { +            u"skip_download": True +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('video_id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._og_search_title(webpage) +        thumbnail = self._og_search_thumbnail(webpage) +        description = self._og_search_description(webpage) + +        log_str = self._search_regex( +            r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') +        log = json.loads(log_str) + +        format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' +                      u'video-formats2' % log) +        format_str = self._download_webpage( +            format_url, video_id, +            note=u'Downloading formats', +            errnote=u'Error while downloading formats') + +        format_doc = xml.etree.ElementTree.fromstring(format_str) +  +        video_url_template = ( +            u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' +            u'&source=site&hash=%(hash)s&username=%(userid)s&' +            u'key=ministhebest&format=%%s&sig=&exp=' % +            log) +        formats = [ +            { +                'format': fnode.text, +                'url': video_url_template % fnode.text, +            } + +            for fnode in format_doc.findall('./formats/format') +        ] + +        info = { +            '_type': 'video', +            'id': video_id, +            'formats': formats, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +        } + +        # TODO: Remove when #980 has been merged +        info['url'] = formats[-1]['url'] +        info['ext'] = formats[-1]['format'].partition('-')[0] + +        return info diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re  from .common import InfoExtractor  from ..utils import ( -    compat_urllib_parse,      unified_strdate,  ) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import (      unescapeHTML,      unified_strdate,  ) - +from ..aes import ( +    aes_decrypt_text +)  class YouPornIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'      _TEST = {          u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',          u'file': u'505835.mp4', -        u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', +        u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',          u'info_dict': {              u"upload_date": u"20101221",               u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",  @@ -75,7 +77,15 @@ class YouPornIE(InfoExtractor):          # Get all of the links from the page          LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'          links = re.findall(LINK_RE, download_list_html) -        if(len(links) == 0): +         +        # Get link of hd video if available +        mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage) +        if mobj != None: +            encrypted_video_url = mobj.group(u'encrypted_video_url') +            video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') +            links = [video_url] + links +         +        if not links:              raise ExtractorError(u'ERROR: no known formats available for video')          self.to_screen(u'Links found: %d' % len(links)) @@ -112,7 +122,7 @@ class YouPornIE(InfoExtractor):              self._print_formats(formats)              return -        req_format = self._downloader.params.get('format', None) +        req_format = self._downloader.params.get('format', 'best')          self.to_screen(u'Format: %s' % req_format)          if req_format is None or req_format == 'best': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index af01c9da0..8e486afd0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          elif len(s) == 89:              return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]          elif len(s) == 88: -            return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] +            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]          elif len(s) == 87:              return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]          elif len(s) == 86: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ab1049cc0..b3d0f64ea 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,19 +1,20 @@  #!/usr/bin/env python  # -*- coding: utf-8 -*- +import datetime +import email.utils  import errno  import gzip  import io  import json  import locale  import os +import platform  import re +import socket  import sys  import traceback  import zlib -import email.utils -import socket -import datetime  try:      import urllib.request as compat_urllib_request @@ -61,6 +62,11 @@ except ImportError: # Python 2      import httplib as compat_http_client  try: +    from urllib.error import HTTPError as compat_HTTPError +except ImportError:  # Python 2 +    from urllib2 import HTTPError as compat_HTTPError + +try:      from subprocess import DEVNULL      compat_subprocess_get_DEVNULL = lambda: DEVNULL  except ImportError: @@ -476,7 +482,7 @@ def formatSeconds(secs):  def make_HTTPS_handler(opts):      if sys.version_info < (3,2):          # Python's 2.x handler is very simplistic -        return YoutubeDLHandlerHTTPS() +        return compat_urllib_request.HTTPSHandler()      else:          import ssl          context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) @@ -485,11 +491,11 @@ def make_HTTPS_handler(opts):          context.verify_mode = (ssl.CERT_NONE                                 if opts.no_check_certificate                                 else ssl.CERT_REQUIRED) -        return YoutubeDLHandlerHTTPS(context=context) +        return compat_urllib_request.HTTPSHandler(context=context)  class ExtractorError(Exception):      """Error during info extraction.""" -    def __init__(self, msg, tb=None, expected=False): +    def __init__(self, msg, tb=None, expected=False, cause=None):          """ tb, if given, is the original traceback (so that it can be printed out).          If expected is set, this is a normal error message and most likely not a bug in youtube-dl.          """ @@ -502,6 +508,7 @@ class ExtractorError(Exception):          self.traceback = tb          self.exc_info = sys.exc_info()  # preserve original exception +        self.cause = cause      def format_traceback(self):          if self.traceback is None: @@ -569,8 +576,7 @@ class ContentTooShortError(Exception):          self.downloaded = downloaded          self.expected = expected - -class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler +class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      """Handler for HTTP requests and responses.      This class, when installed with an OpenerDirector, automatically adds @@ -603,8 +609,8 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler          ret.code = code          return ret -    def _http_request(self, req): -        for h, v in std_headers.items(): +    def http_request(self, req): +        for h,v in std_headers.items():              if h in req.headers:                  del req.headers[h]              req.add_header(h, v) @@ -619,12 +625,27 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler              del req.headers['Youtubedl-user-agent']          return req -    def _http_response(self, req, resp): +    def http_response(self, req, resp):          old_resp = resp          # gzip          if resp.headers.get('Content-encoding', '') == 'gzip': -            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') -            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) +            content = resp.read() +            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') +            try: +                uncompressed = io.BytesIO(gz.read()) +            except IOError as original_ioerror: +                # There may be junk add the end of the file +                # See http://stackoverflow.com/q/4928560/35070 for details +                for i in range(1, 1024): +                    try: +                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') +                        uncompressed = io.BytesIO(gz.read()) +                    except IOError: +                        continue +                    break +                else: +                    raise original_ioerror +            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg          # deflate          if resp.headers.get('Content-encoding', '') == 'deflate': @@ -633,16 +654,8 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler              resp.msg = old_resp.msg          return resp - -class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler): -    http_request = YoutubeDLHandler_Template._http_request -    http_response = YoutubeDLHandler_Template._http_response - - -class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler): -    https_request = YoutubeDLHandler_Template._http_request -    https_response = YoutubeDLHandler_Template._http_response - +    https_request = http_request +    https_response = http_response  def unified_strdate(date_str):      """Return a string with the date in the format YYYYMMDD""" @@ -720,3 +733,31 @@ class DateRange(object):          return self.start <= date <= self.end      def __str__(self):          return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): +    """ Returns the platform name as a compat_str """ +    res = platform.platform() +    if isinstance(res, bytes): +        res = res.decode(preferredencoding()) + +    assert isinstance(res, compat_str) +    return res + + +def bytes_to_intlist(bs): +    if not bs: +        return [] +    if isinstance(bs[0], int):  # Python 3 +        return list(bs) +    else: +        return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): +    if not xs: +        return b'' +    if isinstance(chr(0), bytes):  # Python 2 +        return ''.join([chr(x) for x in xs]) +    else: +        return bytes(xs) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c10ebd4e8..0b56e48dc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.23' +__version__ = '2013.08.28'  | 
