From 56c7366547462ecec0536df58971249a8a870ddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 8 Jul 2013 15:14:27 +0200 Subject: YoutubeIE: reuse instances of InfoExtractors (closes #998) When a IE is added to the list, it's also added to a dictionary. When a IE is requested it first looks in the dictionary and if there's no instance it will create a new one. That way _real_initialize is only called once for each IE, saving time if it needs to login for example. --- youtube_dl/YoutubeDL.py | 18 +++++++++++++++--- youtube_dl/extractor/common.py | 5 +++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d3281fed2..cd3d6ea7b 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,6 +97,7 @@ class YoutubeDL(object): def __init__(self, params): """Create a FileDownloader object with the given options.""" self._ies = [] + self._ies_instances = {} self._pps = [] self._progress_hooks = [] self._download_retcode = 0 @@ -111,8 +112,21 @@ class YoutubeDL(object): def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) + self._ies_instances[ie.ie_key()] = ie ie.set_downloader(self) + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. + """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + def add_default_info_extractors(self): """ Add the InfoExtractors returned by gen_extractors to the end of the list @@ -294,9 +308,7 @@ class YoutubeDL(object): ''' if ie_key: - ie = get_info_extractor(ie_key)() - ie.set_downloader(self) - ies = [ie] + ies = [self.get_info_extractor(ie_key)] else: ies = self._ies diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1d98222ce..236c7b12c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -106,6 +106,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] -- cgit v1.2.3 From 6d3a7d03e14fcbc704bf30d305fb95c5829e55a6 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Fri, 2 Aug 2013 15:26:11 +0800 Subject: fix bug: kankan extractor not support http://vod.kankan.com/v/70/70309.shtml --- youtube_dl/extractor/kankan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') - gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') + surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) + gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) + gcid = gcids[-1] video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, video_id, u'Downloading video url info') -- cgit v1.2.3 From 6624a2b07dafad4de895b4e84f4595214817518d Mon Sep 17 00:00:00 2001 From: huohuarong Date: Fri, 2 Aug 2013 17:58:46 +0800 Subject: add an extractor for tv.sohu.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sohu.py | 97 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/sohu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c20172a53..3a08d676f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -55,6 +55,7 @@ from .redtube import RedTubeIE from .ringtv import RingTVIE from .roxwel import RoxwelIE from .sina import SinaIE +from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..830814221 --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,97 @@ +# encoding: utf-8 + +import re +import json +import time +import logging +import urllib2 + +from .common import InfoExtractor +from ..utils import compat_urllib_request + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P\d+)\.shtml.*?' + + _TEST = { + u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', + u'file': u'382479172.flv', + u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b', + u'info_dict': { + u'title': u'The Illest - Far East Movement Riff Raff', + }, + } + + def _clearn_html(self, string): + tags = re.findall(r'<.+?>', string) + for t in tags: + string = string.replace(t, ' ') + for i in range(2): + spaces = re.findall(r'\s+', string) + for s in spaces: + string = string.replace(s, ' ') + string = string.strip() + return string + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + pattern = r'

\n*?(.+?)\n*?

' + compiled = re.compile(pattern, re.DOTALL) + title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') + title = self._clearn_html(title) + pattern = re.compile(r'var vid="(\d+)"') + result = re.search(pattern, webpage) + if not result: + logging.info('[Sohu] could not get vid') + return None + vid = result.group(1) + logging.info('vid: %s' % vid) + base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + url_1 = base_url_1 + vid + logging.info('json url: %s' % url_1) + json_1 = json.loads(urllib2.urlopen(url_1).read()) + # get the highest definition video vid and json infomation. + vids = [] + qualities = ('oriVid', 'superVid', 'highVid', 'norVid') + for vid_name in qualities: + vids.append(json_1['data'][vid_name]) + clearest_vid = 0 + for i, v in enumerate(vids): + if v != 0: + clearest_vid = v + logging.info('quality definition: %s' % qualities[i][:-3]) + break + if not clearest_vid: + logging.warning('could not find valid clearest_vid') + return None + if vid != clearest_vid: + url_1 = '%s%d' % (base_url_1, clearest_vid) + logging.info('highest definition json url: %s' % url_1) + json_1 = json.loads(urllib2.urlopen(url_1).read()) + allot = json_1['allot'] + prot = json_1['prot'] + clipsURL = json_1['data']['clipsURL'] + su = json_1['data']['su'] + num_of_parts = json_1['data']['totalBlocks'] + logging.info('Total parts: %d' % num_of_parts) + base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' + files_info = [] + for i in range(num_of_parts): + middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) + logging.info('middle url part %d: %s' % (i, middle_url)) + middle_info = urllib2.urlopen(middle_url).read().split('|') + middle_part_1 = middle_info[0] + download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3]) + + info = { + 'id': '%s_part%02d' % (video_id, i + 1), + 'title': title, + 'url': download_url, + 'ext': 'mp4', + } + files_info.append(info) + time.sleep(1) + + return files_info -- cgit v1.2.3 From 4ec929dc9b55a2588b4a27e64871c5bfa900bf37 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Sat, 3 Aug 2013 10:29:58 +0800 Subject: use ..utils/clean_html() --- youtube_dl/extractor/sohu.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 830814221..cf0ab5478 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -7,7 +7,7 @@ import logging import urllib2 from .common import InfoExtractor -from ..utils import compat_urllib_request +from ..utils import compat_urllib_request, clean_html class SohuIE(InfoExtractor): @@ -22,16 +22,6 @@ class SohuIE(InfoExtractor): }, } - def _clearn_html(self, string): - tags = re.findall(r'<.+?>', string) - for t in tags: - string = string.replace(t, ' ') - for i in range(2): - spaces = re.findall(r'\s+', string) - for s in spaces: - string = string.replace(s, ' ') - string = string.strip() - return string def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,7 +30,7 @@ class SohuIE(InfoExtractor): pattern = r'

\n*?(.+?)\n*?

' compiled = re.compile(pattern, re.DOTALL) title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') - title = self._clearn_html(title) + title = clean_html(title) pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -93,5 +83,8 @@ class SohuIE(InfoExtractor): } files_info.append(info) time.sleep(1) - + if num_of_parts == 1: + info = files_info[0] + info['id'] = video_id + return info return files_info -- cgit v1.2.3 From 968b5e0112a83f2a4637226d4d743b394ebed038 Mon Sep 17 00:00:00 2001 From: Dan Church Date: Sun, 4 Aug 2013 12:45:24 -0500 Subject: Add some verbosity when reporting finished downloads For example: [download] Resuming download at byte 1868140 [download] Destination: Entry #1-Bn59FJ4HrmU.flv [download] 100% of 3.27MiB in 4s This format is meant to somewhat mirror the behavior of wget(1) when reporting finished downloads: 100%[==================>] 54,836,682 788KB/s in 74s 2013-08-04 12:32:05 (728 KB/s) - 'google-chrome-stable_current_x86_64.rpm' saved [54836682/54836682] --- youtube_dl/FileDownloader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index ea6b9d626..ab06533c0 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -230,12 +230,14 @@ class FileDownloader(object): """Report it was impossible to resume download.""" self.to_screen(u'[download] Unable to resume') - def report_finish(self): + def report_finish(self, data_len_str, tot_time): """Report download finished.""" if self.params.get('noprogress', False): self.to_screen(u'[download] Download completed') else: - self.to_screen(u'') + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + self.to_screen(u'\r%s[download] 100%% of %s in %ss' % + (clear_line, data_len_str, int(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) @@ -538,7 +540,7 @@ class FileDownloader(object): self.report_error(u'Did not get any data blocks') return False stream.close() - self.report_finish() + self.report_finish(data_len_str, (time.time() - start)) if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, int(data_len)) self.try_rename(tmpfilename, filename) -- cgit v1.2.3 From b5a6d408181c118bf51382f486a2492643ed74ec Mon Sep 17 00:00:00 2001 From: huohuarong Date: Mon, 5 Aug 2013 22:51:54 +0800 Subject: fix parse title bug --- youtube_dl/extractor/sohu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index cf0ab5478..cd049b6f0 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -27,10 +27,10 @@ class SohuIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - pattern = r'

\n*?(.+?)\n*?

' + pattern = r'(.+?)' compiled = re.compile(pattern, re.DOTALL) - title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') - title = clean_html(title) + title = self._search_regex(compiled, webpage, u'video title') + title = clean_html(title).split('-')[0].strip() pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -41,7 +41,8 @@ class SohuIE(InfoExtractor): base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' url_1 = base_url_1 + vid logging.info('json url: %s' % url_1) - json_1 = json.loads(urllib2.urlopen(url_1).read()) + webpage = self._download_webpage(url_1, vid) + json_1 = json.loads(webpage) # get the highest definition video vid and json infomation. vids = [] qualities = ('oriVid', 'superVid', 'highVid', 'norVid') -- cgit v1.2.3 From 461cead4f788f6a69902f350b9143a5e1588b57d Mon Sep 17 00:00:00 2001 From: tsantala Date: Tue, 6 Aug 2013 04:34:24 +0300 Subject: changes --- youtube_dl/extractor/AddAnime.py | 54 ++++++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/__init__.py | 2 ++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/AddAnime.py diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py new file mode 100644 index 000000000..43b0b24fe --- /dev/null +++ b/youtube_dl/extractor/AddAnime.py @@ -0,0 +1,54 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) +from bs4 import BeautifulSoup + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^(?:http?://)?(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'137499050692ced.flv', + u'md5': u'0813c2430bea7a46bf13acf3406992f4', + u'info_dict': { + u"description": u"One Piece 606", + u"uploader": u"mugiwaraQ8", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r'var normal_video_file = "(.*?)",', + webpage, u'video URL') + + video_title = self._og_search_title(webpage) + + video_description = self._og_search_description(webpage) + + soup = BeautifulSoup(webpage) + + video_uploader= soup.find("meta", {"author":""})['content'] + + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader + } + + return [info] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..28dcb2cc4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ + +from .AddAnime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE -- cgit v1.2.3 From d5b00ee6e0ba70fd5d87752e8772fc1c39e4bd59 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Tue, 6 Aug 2013 10:26:57 +0800 Subject: improve sohu extractor --- youtube_dl/extractor/sohu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index cd049b6f0..24fc3a5d7 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -31,6 +31,7 @@ class SohuIE(InfoExtractor): compiled = re.compile(pattern, re.DOTALL) title = self._search_regex(compiled, webpage, u'video title') title = clean_html(title).split('-')[0].strip() + self.to_screen('Title: %s' % title) pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -70,6 +71,7 @@ class SohuIE(InfoExtractor): base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' files_info = [] for i in range(num_of_parts): + self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts)) middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) logging.info('middle url part %d: %s' % (i, middle_url)) middle_info = urllib2.urlopen(middle_url).read().split('|') -- cgit v1.2.3 From f3bcebb1d2ebf6a69f06b72e1e365bc76970e1e2 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 9 Aug 2013 18:36:01 +0200 Subject: add an aes implementation --- youtube_dl/aes.py | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 youtube_dl/aes.py diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py new file mode 100644 index 000000000..2fa9238e3 --- /dev/null +++ b/youtube_dl/aes.py @@ -0,0 +1,200 @@ +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text'] + +import base64 +from math import ceil + +BLOCK_SIZE_BYTES = 16 + +def aes_ctr_decrypt(data, key, counter): + """ + Decrypt with aes in counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block) + returns the next counter block + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data=[] + for i in range(block_count): + counter_block = counter.next_value() + block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES] + block += [0]*(BLOCK_SIZE_BYTES - len(block)) + + cipher_counter_block = aes_encrypt(counter_block, expanded_key) + decrypted_data += xor(block, cipher_counter_block) + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes/4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes : 4-key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + +def aes_encrypt(data, expanded_key): + """ + Encrypt one block with aes + + @param {int[]} data 16-Byte state + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte cipher + """ + rounds = len(expanded_key) / BLOCK_SIZE_BYTES - 1 + + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + for i in range(1, rounds+1): + data = sub_bytes(data) + data = shift_rows(data) + if i != rounds: + data = mix_columns(data) + data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) + + return data + +def aes_decrypt_text(data, password, key_size_bytes): + """ + Decrypt text + - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter + - The cipher key is retrieved by encrypting the first 16 Byte of 'password' + with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) + - Mode of operation is 'counter' + + @param {str} data Base64 encoded string + @param {str,unicode} password Password (will be encoded with utf-8) + @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit + @returns {str} Decrypted data + """ + NONCE_LENGTH_BYTES = 8 + + data = map(lambda c: ord(c), base64.b64decode(data)) + password = map(lambda c: ord(c), password.encode('utf-8')) + + key = password[:key_size_bytes] + [0]*(key_size_bytes - len(password)) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes / BLOCK_SIZE_BYTES) + + nonce = data[:NONCE_LENGTH_BYTES] + cipher = data[NONCE_LENGTH_BYTES:] + + class Counter: + __value = nonce + [0]*(BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + + decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) + plaintext = ''.join(map(lambda x: chr(x), decrypted_data)) + + return plaintext + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +MIX_COLUMN_MATRIX = ((2,3,1,1), + (1,2,3,1), + (1,1,2,3), + (3,1,1,2)) + +def sub_bytes(data): + return map(lambda x: SBOX[x], data) + +def rotate(data): + return data[1:] + [data[0]] + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + + return data + +def xor(data1, data2): + return map(lambda (x,y): x^y, zip(data1, data2)) + +def mix_column(data): + data_mixed = [] + for row in range(4): + mixed = 0 + for column in range(4): + addend = data[column] + if MIX_COLUMN_MATRIX[row][column] in (2,3): + addend <<= 1 + if addend > 0xff: + addend &= 0xff + addend ^= 0x1b + if MIX_COLUMN_MATRIX[row][column] == 3: + addend ^= data[column] + mixed ^= addend & 0xff + data_mixed.append(mixed) + return data_mixed + +def mix_columns(data): + data_mixed = [] + for i in range(4): + column = data[i*4 : (i+1)*4] + data_mixed += mix_column(column) + return data_mixed + +def shift_rows(data): + data_shifted = [] + for column in range(4): + for row in range(4): + data_shifted.append( data[((column + row) & 0b11) * 4 + row] ) + return data_shifted + +def inc(data): + data = data[:] # copy + for i in range(len(data)-1,-1,-1): + if data[i] == 255: + data[i] = 0 + else: + data[i] = data[i] + 1 + break + return data -- cgit v1.2.3 From 97b3656c2e37e45d556816b8f1f15c20d14f1acd Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 9 Aug 2013 18:37:33 +0200 Subject: YoupornIE: Add support for hd videos and update Test --- youtube_dl/extractor/youporn.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..cc9c37027 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import ( unescapeHTML, unified_strdate, ) - +from ..aes import ( + aes_decrypt_text +) class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', - u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', + u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { u"upload_date": u"20101221", u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", @@ -75,6 +77,14 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' links = re.findall(LINK_RE, download_list_html) + + # Get link of hd video + encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', + webpage, u'encrypted_video_url') + video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8') + if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates + links = [video_url] + links + if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') -- cgit v1.2.3 From 5a27ecdd2ec83ba6e1069428c4c0fb3bd61f638c Mon Sep 17 00:00:00 2001 From: kkalpakloglou <kkalpakloglou@yahoo.com> Date: Fri, 16 Aug 2013 23:54:09 +0300 Subject: Update AddAnime.py --- youtube_dl/extractor/AddAnime.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py index 43b0b24fe..a312fa97e 100644 --- a/youtube_dl/extractor/AddAnime.py +++ b/youtube_dl/extractor/AddAnime.py @@ -1,11 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) -from bs4 import BeautifulSoup - class AddAnimeIE(InfoExtractor): @@ -17,7 +12,6 @@ class AddAnimeIE(InfoExtractor): u'md5': u'0813c2430bea7a46bf13acf3406992f4', u'info_dict': { u"description": u"One Piece 606", - u"uploader": u"mugiwaraQ8", u"title": u"One Piece 606" } } @@ -31,24 +25,27 @@ class AddAnimeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'var normal_video_file = "(.*?)",', - webpage, u'video URL') + + def find_between( webpage, first, last ): + try: + start = webpage.index( first ) + len( first ) + end = webpage.index( last, start ) + return webpage[start:end] + except ValueError: + return "" + + video_url = find_between( webpage, "var normal_video_file = '", "';" ) video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) - - soup = BeautifulSoup(webpage) - - video_uploader= soup.find("meta", {"author":""})['content'] info = { 'id': video_id, 'url': video_url, 'ext': 'flv', 'title': video_title, - 'description': video_description, - 'uploader': video_uploader + 'description': video_description } return [info] -- cgit v1.2.3 From 943f7f7a399c6fb3006eb2bd68070f28a272171f Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <pierre@rudloff.pro> Date: Sun, 18 Aug 2013 16:11:47 +0200 Subject: Download videos from jeuxvideo.com --- youtube-dl | Bin 3445 -> 173825 bytes youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/jeuxvideo.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/jeuxvideo.py diff --git a/youtube-dl b/youtube-dl index e3eb8774c..ae8cc98e3 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..b9bd3a429 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py new file mode 100644 index 000000000..d74a1c9b4 --- /dev/null +++ b/youtube_dl/extractor/jeuxvideo.py @@ -0,0 +1,33 @@ +import json +import re + +from .common import InfoExtractor + +class JeuxVideoIE(InfoExtractor): + _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage) + + xml_link = m_download.group(1) + + id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + + xml_config = self._download_webpage(xml_link, title, + 'Downloading XML config') + info = re.search(r'<format\.json>(.*?)</format\.json>', + xml_config, re.MULTILINE|re.DOTALL).group(1) + info = json.loads(info)['versions'][0] + + video_url = 'http://video720.jeuxvideo.com/' + info['file'] + + track_info = {'id':id, + 'title' : title, + 'ext' : 'mp4', + 'url' : video_url + } + + return [track_info] -- cgit v1.2.3 From 7070b83687ed134af6d9a71bbf2ec759a56965d5 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <pierre@rudloff.pro> Date: Thu, 22 Aug 2013 12:54:17 +0200 Subject: Merge remote-tracking branch 'upstream/master' --- youtube_dl/extractor/jeuxvideo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index c8a8ae1b3..4327bc13d 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -31,7 +31,6 @@ class JeuxVideoIE(InfoExtractor): xml_config = self._download_webpage(xml_link, title, 'Downloading XML config') - config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) info = re.search(r'<format\.json>(.*?)</format\.json>', xml_config, re.MULTILINE|re.DOTALL).group(1) -- cgit v1.2.3 From 05a2926c5c7737e6f17ccf96a35a01ec05fe092b Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <pierre@rudloff.pro> Date: Thu, 22 Aug 2013 12:55:58 +0200 Subject: Merge remote-tracking branch 'upstream/master' --- youtube-dl | Bin 173825 -> 3445 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/youtube-dl b/youtube-dl index ae8cc98e3..e3eb8774c 100755 Binary files a/youtube-dl and b/youtube-dl differ -- cgit v1.2.3 From cd0abcc0bb4c218fd02850a139b626d252e22599 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <contact@rudloff.pro> Date: Thu, 22 Aug 2013 13:54:23 +0200 Subject: Extractor for canalc2.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/canalc2.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/canalc2.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9d12608e1..576b8433a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -7,6 +7,7 @@ from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..d0e2ed536 --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,37 @@ +# coding: utf-8 +"""Extractor for canalc2.tv""" +import re +import lxml.html + +from .common import InfoExtractor + +class Canalc2IE(InfoExtractor): + """Extractor for canalc2.tv""" + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", + webpage).group(1) + + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + html = lxml.html.fromstring(webpage) + + title = html.cssselect('.evenement8')[0].text_content() + + return {'id': video_id, + 'ext' : 'mp4', + 'url' : video_url, + 'title' : title + } -- cgit v1.2.3 From ff2424595adf02cbe5d1f1071e53c3b2e5f32c9e Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <contact@rudloff.pro> Date: Thu, 22 Aug 2013 14:47:51 +0200 Subject: lxml is not part of the standard library. --- youtube_dl/extractor/canalc2.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d0e2ed536..215abf537 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,7 +1,6 @@ # coding: utf-8 """Extractor for canalc2.tv""" import re -import lxml.html from .common import InfoExtractor @@ -25,10 +24,9 @@ class Canalc2IE(InfoExtractor): webpage).group(1) video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - - html = lxml.html.fromstring(webpage) - - title = html.cssselect('.evenement8')[0].text_content() + + title = self._html_search_regex(r'class="evenement8">(.*?)</a>', + webpage, u'title') return {'id': video_id, 'ext' : 'mp4', -- cgit v1.2.3 From 341ca8d74c8f090bd696111353400f0cef2ba9bc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 01:59:00 +0200 Subject: [trilulilu] Add support for trilulilu.ro Fun fact: The ads (not yet supported) are loaded from youtube ;) --- youtube_dl/extractor/__init__.py | 5 +-- youtube_dl/extractor/trilulilu.py | 76 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 youtube_dl/extractor/trilulilu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f71ae2713..fa53d9af9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -73,18 +74,18 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE -from .ustream import UstreamIE from .unistra import UnistraIE +from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..1c46156c7 --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,76 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class TriluliluIE(InfoExtractor): + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", + u'file': u"big-buck-bunny-1.mp4", + u'info_dict': { + u"title": u"Big Buck Bunny", + u"description": u":) pentru copilul din noi", + }, + # Server ignores Range headers (--test) + u"params": { + u"skip_download": True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + log_str = self._search_regex( + r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') + log = json.loads(log_str) + + format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' + u'video-formats2' % log) + format_str = self._download_webpage( + format_url, video_id, + note=u'Downloading formats', + errnote=u'Error while downloading formats') + + format_doc = xml.etree.ElementTree.fromstring(format_str) + + video_url_template = ( + u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' + u'&source=site&hash=%(hash)s&username=%(userid)s&' + u'key=ministhebest&format=%%s&sig=&exp=' % + log) + formats = [ + { + 'format': fnode.text, + 'url': video_url_template % fnode.text, + } + + for fnode in format_doc.findall('./formats/format') + ] + + info = { + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['format'].partition('-')[0] + + return info -- cgit v1.2.3 From b3889f702396b1e9641f2329d793915e5ae1454c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 02:30:47 +0200 Subject: release 2013.08.27 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c10ebd4e8..dff568640 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.23' +__version__ = '2013.08.27' -- cgit v1.2.3 From 069d098f846ca53073ec646f335f77dac4439844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 10:21:57 +0200 Subject: [canalplus] Accept player.canalplus.fr urls --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' -- cgit v1.2.3 From 2a7b4da9b2ee11e88976e0e93796fd8460aa053d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 10:25:38 +0200 Subject: [hark] get the song info in JSON and extract more information. --- youtube_dl/extractor/hark.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..5bdd08afa 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +import json from .common import InfoExtractor from ..utils import determine_ext @@ -12,24 +13,25 @@ class HarkIE(InfoExtractor): u'file': u'mmbzyhkgny.mp3', u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', u'info_dict': { - u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", + u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + u'duration': 11, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) - webpage = self._download_webpage(embed_url, video_id) - - final_url = self._search_regex(r'src="(.+?).mp3"', - webpage, 'video url')+'.mp3' - title = self._html_search_regex(r'<title>(.+?)', - webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( - 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + json_url = "http://www.hark.com/clips/%s.json" %(video_id) + info_json = self._download_webpage(json_url, video_id) + info = json.loads(info_json) + final_url = info['url'] return {'id': video_id, 'url' : final_url, - 'title': title, + 'title': info['name'], 'ext': determine_ext(final_url), + 'description': info['description'], + 'thumbnail': info['image_original'], + 'duration': info['duration'], } -- cgit v1.2.3 From e86ea47c029c1f95a696e43df7bea2e3e617fbc3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Aug 2013 10:35:20 +0200 Subject: [canalc2] Small improvements --- youtube_dl/extractor/canalc2.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 215abf537..50832217a 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,17 +1,17 @@ # coding: utf-8 -"""Extractor for canalc2.tv""" import re from .common import InfoExtractor + class Canalc2IE(InfoExtractor): - """Extractor for canalc2.tv""" + _IE_NAME = 'canalc2.tv' _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' _TEST = { u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', u'file': u'12163.mp4', - u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'md5': u'060158428b650f896c542dfbb3d6487f', u'info_dict': { u'title': u'Terrasses du Numérique' } @@ -20,16 +20,16 @@ class Canalc2IE(InfoExtractor): def _real_extract(self, url): video_id = re.match(self._VALID_URL, url).group(1) webpage = self._download_webpage(url, video_id) - file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", - webpage).group(1) - + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - title = self._html_search_regex(r'class="evenement8">(.*?)', - webpage, u'title') + title = self._html_search_regex( + r'class="evenement8">(.*?)', webpage, u'title') return {'id': video_id, - 'ext' : 'mp4', - 'url' : video_url, - 'title' : title + 'ext': 'mp4', + 'url': video_url, + 'title': title, } -- cgit v1.2.3 From 1a582dd49d628914fa6a056b490914738f15c56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 27 Aug 2013 11:56:48 +0200 Subject: Add an extractor for CNN (closes #1318) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cnn.py | 47 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/cnn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eeeb3db50..ea2af0d0e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..cee78765b --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,47 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + +class CNNIE(InfoExtractor): + _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P.+?/(?P[^/]+?)\.cnn)' + + _TEST = { + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_xml = self._download_webpage( + 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } -- cgit v1.2.3 From 0bc56fa66a4b0f1b6bf827bd3550a119d3e3b231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 12:38:30 +0200 Subject: Add an extractor for NBC news (closes #1320) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nbc.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/nbc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ea2af0d0e..27bbcc0f7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,7 @@ from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .nbc import NBCNewsIE from .ooyala import OoyalaIE from .pbs import PBSIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } -- cgit v1.2.3 From 7f3c4f4f65ddb4f8374b31b74428780e60a373de Mon Sep 17 00:00:00 2001 From: Jeff Smith <whydoubt@yahoo.com> Date: Tue, 27 Aug 2013 14:38:50 -0500 Subject: Initial slash in Google+ photos link was removed --- youtube_dl/extractor/googleplus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com' - video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), + DOMAIN = 'https://plus.google.com/' + video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page -- cgit v1.2.3 From acebc9cd6bd713c518e80d00af13e3120614e115 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:15:01 +0200 Subject: Revert "Install our own HTTPS handler as well (#1309)" This reverts commit 36399e85765a6a04fd84126264af75382fcfd1f6 and fixes #1322. --- youtube_dl/utils.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ab1049cc0..52cfb8a6d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -476,7 +476,7 @@ def formatSeconds(secs): def make_HTTPS_handler(opts): if sys.version_info < (3,2): # Python's 2.x handler is very simplistic - return YoutubeDLHandlerHTTPS() + return compat_urllib_request.HTTPSHandler() else: import ssl context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) @@ -485,7 +485,7 @@ def make_HTTPS_handler(opts): context.verify_mode = (ssl.CERT_NONE if opts.no_check_certificate else ssl.CERT_REQUIRED) - return YoutubeDLHandlerHTTPS(context=context) + return compat_urllib_request.HTTPSHandler(context=context) class ExtractorError(Exception): """Error during info extraction.""" @@ -569,8 +569,7 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected - -class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -603,8 +602,8 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler ret.code = code return ret - def _http_request(self, req): - for h, v in std_headers.items(): + def http_request(self, req): + for h,v in std_headers.items(): if h in req.headers: del req.headers[h] req.add_header(h, v) @@ -619,7 +618,7 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler del req.headers['Youtubedl-user-agent'] return req - def _http_response(self, req, resp): + def http_response(self, req, resp): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': @@ -633,16 +632,8 @@ class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler resp.msg = old_resp.msg return resp - -class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler): - http_request = YoutubeDLHandler_Template._http_request - http_response = YoutubeDLHandler_Template._http_response - - -class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler): - https_request = YoutubeDLHandler_Template._http_request - https_response = YoutubeDLHandler_Template._http_response - + https_request = http_request + https_response = http_response def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD""" -- cgit v1.2.3 From 88a79ce6a6edb6d3f8b6e3319f6ca1d3d2954c16 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:31:24 +0200 Subject: Delete default user agent (Fixes #1309) --- youtube_dl/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 614429073..bc6a6d180 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -430,6 +430,10 @@ def _real_main(argv=None): proxy_handler = compat_urllib_request.ProxyHandler(proxies) https_handler = make_HTTPS_handler(opts) opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders =[] compat_urllib_request.install_opener(opener) socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) -- cgit v1.2.3 From 1619e22f40883ba3c39f4d2a020cba3a1eebd34f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 23:31:36 +0200 Subject: release 2013.08.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dff568640..0b56e48dc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.27' +__version__ = '2013.08.28' -- cgit v1.2.3 From 273f603efb2028a54e04cca314b72bc2a9d767ef Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 00:14:19 +0200 Subject: [cnn] Allow more URLs --- youtube_dl/extractor/cnn.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index cee78765b..4338bd180 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -4,10 +4,12 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext + class CNNIE(InfoExtractor): - _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)' + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' - _TEST = { + _TESTS = [{ u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', u'md5': u'3e6121ea48df7e2259fe73a0628605c4', @@ -15,14 +17,24 @@ class CNNIE(InfoExtractor): u'title': u'Nadal wins 8th French Open title', u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', }, - } + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) path = mobj.group('path') page_title = mobj.group('title') - info_xml = self._download_webpage( - 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + print(info_url) + info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) formats = [] -- cgit v1.2.3 From 44586389e4676dfd926255cf76e36684dcf4742d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 02:18:44 +0200 Subject: [appletrailers] Add support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/appletrailers.py | 167 ++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 youtube_dl/extractor/appletrailers.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 27bbcc0f7..2f86f2aca 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,4 @@ +from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..7d126e2d2 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,167 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) + playlist_html = u'<html>' + playlist_cleaned + u'</html>' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//li/a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } -- cgit v1.2.3 From 0e283428f777a23de3c5a522aa283f87cda1b40a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:18:39 +0200 Subject: HTTPError is in urllib.error in Python 3, not in http.error --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f78b5fe78..e6fa634a7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -61,7 +61,7 @@ except ImportError: # Python 2 import httplib as compat_http_client try: - from http.error import HTTPError as compat_HTTPError + from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError -- cgit v1.2.3 From a1bb0f8773e0fff787ffe7bd1729073f3385d2ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:20:37 +0200 Subject: [cnn] remove debug print call. --- youtube_dl/extractor/cnn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 4338bd180..a79f881cd 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -33,7 +33,6 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path - print(info_url) info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) -- cgit v1.2.3 From 3e223834d9f358bc7cb1c3748dc63d1ab40d9b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:26:44 +0200 Subject: [youtube] update algo for length 88, thanks to @Ramhack (fixes #1328) --- devscripts/youtube_genalgo.py | 4 ++-- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 6f1d6ef99..917e8f79d 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -14,9 +14,9 @@ tests = [ # 89 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), - # 88 + # 88 - vflapUV9V 2013/08/28 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", - "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), + "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index af01c9da0..8e486afd0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: -- cgit v1.2.3 From 4f5f18acb93ea2bf70f80c7f76e6bb6b8dee3fbf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 10:28:16 +0200 Subject: [addanime] add file --- youtube_dl/extractor/addanime.py | 76 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 youtube_dl/extractor/addanime.py diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..46db8262f --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,76 @@ +import ast +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'<form id="challenge-form" action="([^"]+)"', + redir_webpage, u'Redirect form') + vc = self._search_regex( + r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } -- cgit v1.2.3 From af8bd6a82d140e5a776185707a9b21d5b8a9fe52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:55:31 +0200 Subject: Show the time taken to download in the same format as the ETA --- youtube_dl/FileDownloader.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 4f6a23835..7c5ac4bc2 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -63,6 +63,17 @@ class FileDownloader(object): converted = float(bytes) / float(1024 ** exponent) return '%.2f%s' % (converted, suffix) + @staticmethod + def format_seconds(seconds): + (mins, secs) = divmod(seconds, 60) + (hours, eta_mins) = divmod(mins, 60) + if hours > 99: + return '--:--:--' + if hours == 0: + return '%02d:%02d' % (mins, secs) + else: + return '%02d:%02d:%02d' % (hours, mins, secs) + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -78,14 +89,7 @@ class FileDownloader(object): return '--:--' rate = float(current) / dif eta = int((float(total) - float(current)) / rate) - (eta_mins, eta_secs) = divmod(eta, 60) - (eta_hours, eta_mins) = divmod(eta_mins, 60) - if eta_hours > 99: - return '--:--:--' - if eta_hours == 0: - return '%02d:%02d' % (eta_mins, eta_secs) - else: - return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) + return FileDownloader.format_seconds(eta) @staticmethod def calc_speed(start, now, bytes): @@ -240,8 +244,8 @@ class FileDownloader(object): self.to_screen(u'[download] Download completed') else: clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') - self.to_screen(u'\r%s[download] 100%% of %s in %ss' % - (clear_line, data_len_str, int(tot_time))) + self.to_screen(u'\r%s[download] 100%% of %s in %s' % + (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) -- cgit v1.2.3 From aa3e950764337ef9800c936f4de89b31c00dfcf5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 11:57:13 +0200 Subject: Tolerate junk at the end of gzip-compressed content (#1268) --- youtube_dl/utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e6fa634a7..be788cf5a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -628,8 +628,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + content = resp.read() + gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') + try: + uncompressed = io.BytesIO(gz.read()) + except IOError as original_ioerror: + # There may be junk add the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') + uncompressed = io.BytesIO(gz.read()) + except IOError: + continue + break + else: + raise original_ioerror + resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # deflate if resp.headers.get('Content-encoding', '') == 'deflate': -- cgit v1.2.3 From ae3531adf926998d42d1fb52453491c85e33b5f0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:04:44 +0200 Subject: [generic] Fix URL concatenation When the url is something like http://example.org/foo/bar?x=y and the added is file/video.mp4 , we want http://example.org/foo/file/video.mp4 Fixes #1268. --- youtube_dl/extractor/generic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d034a11bb..bfc9bff49 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -166,7 +166,12 @@ class GenericIE(InfoExtractor): if video_url.startswith('//'): video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url if '://' not in video_url: - video_url = url + ('' if url.endswith('/') else '/') + video_url + up = compat_urllib_parse_urlparse(url) + if video_url.startswith('/'): + video_url = up.scheme + '://' + up.netloc + video_url + else: # relative path + video_url = (up.scheme + '://' + up.netloc + + up.path.rpartition('/')[0] + '/' + video_url) video_id = os.path.basename(video_url) # here's a fun little line of code for you: -- cgit v1.2.3 From edde6c56ac20af57d7fd494810834125bbd3728d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:14:45 +0200 Subject: Print playpath with --get-url (Fixes #1334) --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3fc4ec378..d5f7c81eb 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -448,7 +448,8 @@ class YoutubeDL(object): if self.params.get('forceid', False): compat_print(info_dict['id']) if self.params.get('forceurl', False): - compat_print(info_dict['url']) + # For RTMP URLs, also include the playpath + compat_print(info_dict['url'] + info_dict.get('play_path', u'')) if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: compat_print(info_dict['thumbnail']) if self.params.get('forcedescription', False) and 'description' in info_dict: -- cgit v1.2.3 From a5caba1eb02665cdc982d6be4a933aafd79243de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:47:27 +0200 Subject: [generic] simply use urljoin --- youtube_dl/extractor/generic.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bfc9bff49..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_error, compat_urllib_parse, - compat_urllib_parse_urlparse, compat_urllib_request, + compat_urlparse, ExtractorError, ) @@ -163,15 +163,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_url = compat_urllib_parse.unquote(mobj.group(1)) - if video_url.startswith('//'): - video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url - if '://' not in video_url: - up = compat_urllib_parse_urlparse(url) - if video_url.startswith('/'): - video_url = up.scheme + '://' + up.netloc + video_url - else: # relative path - video_url = (up.scheme + '://' + up.netloc + - up.path.rpartition('/')[0] + '/' + video_url) + video_url = compat_urlparse.urljoin(url, video_url) video_id = os.path.basename(video_url) # here's a fun little line of code for you: -- cgit v1.2.3 From ce6a696e4d964aeb27de46a31a899b28d7ca7754 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:47:38 +0200 Subject: Remove unused imports --- youtube_dl/extractor/addanime.py | 1 - youtube_dl/extractor/appletrailers.py | 1 - youtube_dl/extractor/trilulilu.py | 3 --- youtube_dl/extractor/wat.py | 1 - 4 files changed, 6 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 46db8262f..82a785a19 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -1,4 +1,3 @@ -import ast import re from .common import InfoExtractor diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index b3bdb2955..8b191c196 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -4,7 +4,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( determine_ext, - ExtractorError, ) diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 1c46156c7..f278951ba 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -3,9 +3,6 @@ import re import xml.etree.ElementTree from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TriluliluIE(InfoExtractor): diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, unified_strdate, ) -- cgit v1.2.3 From 67b22dd03686d9e360d87a7751de74b321d3f231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 12:51:22 +0200 Subject: Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mit.py | 76 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/mit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c76b99a81..21e9e5d37 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -50,6 +50,7 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + embed_page = self._download_webpage( + 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, + note=u'Downloading embed page') + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + embed_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', webpage) + description = clean_html(get_element_by_id('edit-description', webpage)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + embed_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') -- cgit v1.2.3 From c496ca96e7639e5dd0020074b7ada18c2bd4ae3e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:57:10 +0200 Subject: Fix platform name in Python 2 with --verbose (Closes #1228) --- youtube_dl/__init__.py | 3 ++- youtube_dl/utils.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bc6a6d180..b33a18a26 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -45,6 +45,7 @@ import sys import warnings import platform + from .utils import * from .update import update_self from .version import __version__ @@ -611,7 +612,7 @@ def _real_main(argv=None): sys.exc_clear() except: pass - sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n') + sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') ydl.add_default_info_extractors() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index be788cf5a..64ab30910 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,19 +1,20 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import datetime +import email.utils import errno import gzip import io import json import locale import os +import platform import re +import socket import sys import traceback import zlib -import email.utils -import socket -import datetime try: import urllib.request as compat_urllib_request @@ -732,3 +733,13 @@ class DateRange(object): return self.start <= date <= self.end def __str__(self): return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): + """ Returns the platform name as a compat_str """ + res = platform.platform() + if isinstance(res, bytes): + res = res.decode(preferredencoding()) + + assert isinstance(res, compat_str) + return res -- cgit v1.2.3 From 8ae97d76eee1bf9e9098797db3be2d7b816196b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 13:37:31 +0200 Subject: PostProcessingError holds the message in the 'msg' property, not in 'message' (fixes #1323) Causes DeprecationWarning: http://www.python.org/dev/peps/pep-0352/ --- youtube_dl/PostProcessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index c02ed7148..ae56d2082 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -137,7 +137,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): try: FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) except FFmpegPostProcessorError as err: - raise AudioConversionError(err.message) + raise AudioConversionError(err.msg) def run(self, information): path = information['filepath'] @@ -207,7 +207,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): except: etype,e,tb = sys.exc_info() if isinstance(e, AudioConversionError): - msg = u'audio conversion failed: ' + e.message + msg = u'audio conversion failed: ' + e.msg else: msg = u'error running ' + (self._exes['avconv'] and 'avconv' or 'ffmpeg') raise PostProcessingError(msg) -- cgit v1.2.3 From f143d86ad2fc0633d8e2da598cf21e73ff0f2872 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 13:59:08 +0200 Subject: [sohu] Handle encoding, and fix tests --- youtube_dl/extractor/common.py | 9 ++- youtube_dl/extractor/sohu.py | 131 ++++++++++++++++++++--------------------- 2 files changed, 71 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77a13aea5..a2986cebe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -145,12 +145,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 24fc3a5d7..77bb0a8dc 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -1,13 +1,10 @@ # encoding: utf-8 -import re import json -import time -import logging -import urllib2 +import re from .common import InfoExtractor -from ..utils import compat_urllib_request, clean_html +from ..utils import ExtractorError class SohuIE(InfoExtractor): @@ -15,79 +12,79 @@ class SohuIE(InfoExtractor): _TEST = { u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', - u'file': u'382479172.flv', - u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b', + u'file': u'382479172.mp4', + u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', u'info_dict': { - u'title': u'The Illest - Far East Movement Riff Raff', + u'title': u'MV:Far East Movement《The Illest》', }, } - def _real_extract(self, url): + + def _fetch_data(vid_id): + base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + data_url = base_data_url + str(vid_id) + data_json = self._download_webpage( + data_url, video_id, + note=u'Downloading JSON data for ' + str(vid_id)) + return json.loads(data_json) + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) - pattern = r'<title>(.+?)' - compiled = re.compile(pattern, re.DOTALL) - title = self._search_regex(compiled, webpage, u'video title') - title = clean_html(title).split('-')[0].strip() - self.to_screen('Title: %s' % title) - pattern = re.compile(r'var vid="(\d+)"') - result = re.search(pattern, webpage) - if not result: - logging.info('[Sohu] could not get vid') - return None - vid = result.group(1) - logging.info('vid: %s' % vid) - base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - url_1 = base_url_1 + vid - logging.info('json url: %s' % url_1) - webpage = self._download_webpage(url_1, vid) - json_1 = json.loads(webpage) - # get the highest definition video vid and json infomation. - vids = [] - qualities = ('oriVid', 'superVid', 'highVid', 'norVid') - for vid_name in qualities: - vids.append(json_1['data'][vid_name]) - clearest_vid = 0 - for i, v in enumerate(vids): - if v != 0: - clearest_vid = v - logging.info('quality definition: %s' % qualities[i][:-3]) - break - if not clearest_vid: - logging.warning('could not find valid clearest_vid') - return None - if vid != clearest_vid: - url_1 = '%s%d' % (base_url_1, clearest_vid) - logging.info('highest definition json url: %s' % url_1) - json_1 = json.loads(urllib2.urlopen(url_1).read()) - allot = json_1['allot'] - prot = json_1['prot'] - clipsURL = json_1['data']['clipsURL'] - su = json_1['data']['su'] - num_of_parts = json_1['data']['totalBlocks'] - logging.info('Total parts: %d' % num_of_parts) - base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' - files_info = [] - for i in range(num_of_parts): - self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts)) - middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) - logging.info('middle url part %d: %s' % (i, middle_url)) - middle_info = urllib2.urlopen(middle_url).read().split('|') - middle_part_1 = middle_info[0] - download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3]) + raw_title = self._html_search_regex(r'(?s)(.+?)', + webpage, u'video title') + title = raw_title.partition('-')[0].strip() - info = { + vid = self._html_search_regex(r'var vid="(\d+)"', webpage, + u'video path') + data = _fetch_data(vid) + + QUALITIES = ('ori', 'super', 'high', 'nor') + vid_ids = [data['data'][q + 'Vid'] + for q in QUALITIES + if data['data'][q + 'Vid'] != 0] + if not vid_ids: + raise ExtractorError(u'No formats available for this video') + + # For now, we just pick the highest available quality + vid_id = vid_ids[-1] + + format_data = data if vid == vid_id else _fetch_data(vid_id) + part_count = format_data['data']['totalBlocks'] + allot = format_data['allot'] + prot = format_data['prot'] + clipsURL = format_data['data']['clipsURL'] + su = format_data['data']['su'] + + playlist = [] + for i in range(part_count): + part_url = ('http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clipsURL[i], su[i])) + part_str = self._download_webpage( + part_url, video_id, + note=u'Downloading part %d of %d' % (i+1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_info = { 'id': '%s_part%02d' % (video_id, i + 1), 'title': title, - 'url': download_url, + 'url': video_url, 'ext': 'mp4', } - files_info.append(info) - time.sleep(1) - if num_of_parts == 1: - info = files_info[0] + playlist.append(video_info) + + if len(playlist) == 1: + info = playlist[0] info['id'] = video_id - return info - return files_info + else: + info = { + '_type': 'playlist', + 'entries': playlist, + 'id': video_id, + } + + return info -- cgit v1.2.3 From 48ea9cea77e7ea24ee867027f03ca37dd1b935d8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Aug 2013 14:28:55 +0200 Subject: Allow changes to run under Python 3 --- youtube_dl/aes.py | 18 ++++++++++-------- youtube_dl/extractor/youporn.py | 12 ++++++++---- youtube_dl/utils.py | 10 ++++++++++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 2fa9238e3..278f8bb82 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -3,6 +3,8 @@ __all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text' import base64 from math import ceil +from .utils import bytes_to_intlist + BLOCK_SIZE_BYTES = 16 def aes_ctr_decrypt(data, key, counter): @@ -16,7 +18,7 @@ def aes_ctr_decrypt(data, key, counter): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = int(ceil(float(len(data)) // BLOCK_SIZE_BYTES)) decrypted_data=[] for i in range(block_count): @@ -40,7 +42,7 @@ def key_expansion(data): data = data[:] # copy rcon_iteration = 1 key_size_bytes = len(data) - expanded_key_size_bytes = (key_size_bytes/4 + 7) * BLOCK_SIZE_BYTES + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES while len(data) < expanded_key_size_bytes: temp = data[-4:] @@ -72,7 +74,7 @@ def aes_encrypt(data, expanded_key): @param {int[]} expanded_key 176/208/240-Byte expanded key @returns {int[]} 16-Byte cipher """ - rounds = len(expanded_key) / BLOCK_SIZE_BYTES - 1 + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) for i in range(1, rounds+1): @@ -99,11 +101,11 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = map(lambda c: ord(c), base64.b64decode(data)) - password = map(lambda c: ord(c), password.encode('utf-8')) + data = bytes_to_intlist(base64.b64decode(data)) + password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0]*(key_size_bytes - len(password)) - key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes / BLOCK_SIZE_BYTES) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) nonce = data[:NONCE_LENGTH_BYTES] cipher = data[NONCE_LENGTH_BYTES:] @@ -143,7 +145,7 @@ MIX_COLUMN_MATRIX = ((2,3,1,1), (3,1,1,2)) def sub_bytes(data): - return map(lambda x: SBOX[x], data) + return [SBOX[x] for x in data] def rotate(data): return data[1:] + [data[0]] @@ -156,7 +158,7 @@ def key_schedule_core(data, rcon_iteration): return data def xor(data1, data2): - return map(lambda (x,y): x^y, zip(data1, data2)) + return [x^y for x, y in zip(data1, data2)] def mix_column(data): data_mixed = [] diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index cc9c37027..19360e273 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,6 +5,7 @@ import sys from .common import InfoExtractor from ..utils import ( + compat_str, compat_urllib_parse_urlparse, compat_urllib_request, @@ -79,13 +80,16 @@ class YouPornIE(InfoExtractor): links = re.findall(LINK_RE, download_list_html) # Get link of hd video - encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', + encrypted_video_url = self._html_search_regex( + r'var encrypted(?:Quality[0-9]+)?URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', webpage, u'encrypted_video_url') - video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32) + print(video_url) + assert isinstance(video_url, compat_str) if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates links = [video_url] + links - if(len(links) == 0): + if not links: raise ExtractorError(u'ERROR: no known formats available for video') self.to_screen(u'Links found: %d' % len(links)) @@ -122,7 +126,7 @@ class YouPornIE(InfoExtractor): self._print_formats(formats) return - req_format = self._downloader.params.get('format', None) + req_format = self._downloader.params.get('format', 'best') self.to_screen(u'Format: %s' % req_format) if req_format is None or req_format == 'best': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 59eeaf4a8..07b40da6c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -708,3 +708,13 @@ class DateRange(object): return self.start <= date <= self.end def __str__(self): return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] + -- cgit v1.2.3 From 920ef0779b6bcd5131e237e5c2ca28361f6d45d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 20 Jul 2013 12:49:24 +0200 Subject: Hide the password and username in verbose mode (closes #1089) --- youtube_dl/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index b33a18a26..431460c57 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -100,6 +100,16 @@ def parseOpts(overrideArguments=None): pass return None + def _hide_login_info(opts): + opts = list(opts) + for private_opt in ['-p', '--password', '-u', '--username']: + try: + i = opts.index(private_opt) + opts[i+1] = '' + except ValueError: + pass + return opts + max_width = 80 max_help_position = 80 @@ -358,9 +368,9 @@ def parseOpts(overrideArguments=None): argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) if opts.verbose: - sys.stderr.write(u'[debug] System config: ' + repr(systemConf) + '\n') - sys.stderr.write(u'[debug] User config: ' + repr(userConf) + '\n') - sys.stderr.write(u'[debug] Command-line args: ' + repr(commandLineConf) + '\n') + sys.stderr.write(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') + sys.stderr.write(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') + sys.stderr.write(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') return parser, opts, args -- cgit v1.2.3 From cba892fa1fd6a7f1278e637c338921c5ae236840 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Wed, 28 Aug 2013 15:59:07 +0200 Subject: Add intlist_to_bytes to utils.py --- youtube_dl/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 07b40da6c..ee8df6a5b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -718,3 +718,10 @@ def bytes_to_intlist(bs): else: return [ord(c) for c in bs] +def intlist_to_bytes(xs): + if not xs: + return b'' + if isinstance(chr(0), bytes): # Python 2 + return ''.join([chr(x) for x in xs]) + else: + return bytes(xs) -- cgit v1.2.3 From 6e74bc41ca07bda56107cfff9ceb98d6f8d28e53 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Wed, 28 Aug 2013 16:01:43 +0200 Subject: Fix division bug in aes.py --- youtube_dl/aes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 278f8bb82..9913d59a4 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -18,7 +18,7 @@ def aes_ctr_decrypt(data, key, counter): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) // BLOCK_SIZE_BYTES)) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) decrypted_data=[] for i in range(block_count): -- cgit v1.2.3 From 0012690aae977d76e9162e2334989498366a8e94 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Wed, 28 Aug 2013 16:03:35 +0200 Subject: Let aes_decrypt_text return bytes instead of unicode --- youtube_dl/aes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 9913d59a4..9a0c93fa6 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -3,7 +3,7 @@ __all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text' import base64 from math import ceil -from .utils import bytes_to_intlist +from .utils import bytes_to_intlist, intlist_to_bytes BLOCK_SIZE_BYTES = 16 @@ -118,7 +118,7 @@ def aes_decrypt_text(data, password, key_size_bytes): return temp decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) - plaintext = ''.join(map(lambda x: chr(x), decrypted_data)) + plaintext = intlist_to_bytes(decrypted_data) return plaintext -- cgit v1.2.3 From 878e83c5a4c84c7abbf3484366e76fbe906c8947 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Wed, 28 Aug 2013 16:04:48 +0200 Subject: YoupornIE: Clean up extraction of hd video --- youtube_dl/extractor/youporn.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 19360e273..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,7 +5,6 @@ import sys from .common import InfoExtractor from ..utils import ( - compat_str, compat_urllib_parse_urlparse, compat_urllib_request, @@ -79,14 +78,11 @@ class YouPornIE(InfoExtractor): LINK_RE = r'(?s)' links = re.findall(LINK_RE, download_list_html) - # Get link of hd video - encrypted_video_url = self._html_search_regex( - r'var encrypted(?:Quality[0-9]+)?URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', - webpage, u'encrypted_video_url') - video_url = aes_decrypt_text(encrypted_video_url, video_title, 32) - print(video_url) - assert isinstance(video_url, compat_str) - if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates + # Get link of hd video if available + mobj = re.search(r'var encryptedQuality720URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', webpage) + if mobj != None: + encrypted_video_url = mobj.group(u'encrypted_video_url') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') links = [video_url] + links if not links: -- cgit v1.2.3