From 56c7366547462ecec0536df58971249a8a870ddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 8 Jul 2013 15:14:27 +0200 Subject: YoutubeIE: reuse instances of InfoExtractors (closes #998) When a IE is added to the list, it's also added to a dictionary. When a IE is requested it first looks in the dictionary and if there's no instance it will create a new one. That way _real_initialize is only called once for each IE, saving time if it needs to login for example. --- youtube_dl/extractor/common.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1d98222ce..236c7b12c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -106,6 +106,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] -- cgit v1.2.3 From 6d3a7d03e14fcbc704bf30d305fb95c5829e55a6 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Fri, 2 Aug 2013 15:26:11 +0800 Subject: fix bug: kankan extractor not support http://vod.kankan.com/v/70/70309.shtml --- youtube_dl/extractor/kankan.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') - gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') + surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) + gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) + gcid = gcids[-1] video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, video_id, u'Downloading video url info') -- cgit v1.2.3 From 6624a2b07dafad4de895b4e84f4595214817518d Mon Sep 17 00:00:00 2001 From: huohuarong Date: Fri, 2 Aug 2013 17:58:46 +0800 Subject: add an extractor for tv.sohu.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sohu.py | 97 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/sohu.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c20172a53..3a08d676f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -55,6 +55,7 @@ from .redtube import RedTubeIE from .ringtv import RingTVIE from .roxwel import RoxwelIE from .sina import SinaIE +from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..830814221 --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,97 @@ +# encoding: utf-8 + +import re +import json +import time +import logging +import urllib2 + +from .common import InfoExtractor +from ..utils import compat_urllib_request + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P\d+)\.shtml.*?' + + _TEST = { + u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', + u'file': u'382479172.flv', + u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b', + u'info_dict': { + u'title': u'The Illest - Far East Movement Riff Raff', + }, + } + + def _clearn_html(self, string): + tags = re.findall(r'<.+?>', string) + for t in tags: + string = string.replace(t, ' ') + for i in range(2): + spaces = re.findall(r'\s+', string) + for s in spaces: + string = string.replace(s, ' ') + string = string.strip() + return string + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + pattern = r'

\n*?(.+?)\n*?

' + compiled = re.compile(pattern, re.DOTALL) + title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') + title = self._clearn_html(title) + pattern = re.compile(r'var vid="(\d+)"') + result = re.search(pattern, webpage) + if not result: + logging.info('[Sohu] could not get vid') + return None + vid = result.group(1) + logging.info('vid: %s' % vid) + base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + url_1 = base_url_1 + vid + logging.info('json url: %s' % url_1) + json_1 = json.loads(urllib2.urlopen(url_1).read()) + # get the highest definition video vid and json infomation. + vids = [] + qualities = ('oriVid', 'superVid', 'highVid', 'norVid') + for vid_name in qualities: + vids.append(json_1['data'][vid_name]) + clearest_vid = 0 + for i, v in enumerate(vids): + if v != 0: + clearest_vid = v + logging.info('quality definition: %s' % qualities[i][:-3]) + break + if not clearest_vid: + logging.warning('could not find valid clearest_vid') + return None + if vid != clearest_vid: + url_1 = '%s%d' % (base_url_1, clearest_vid) + logging.info('highest definition json url: %s' % url_1) + json_1 = json.loads(urllib2.urlopen(url_1).read()) + allot = json_1['allot'] + prot = json_1['prot'] + clipsURL = json_1['data']['clipsURL'] + su = json_1['data']['su'] + num_of_parts = json_1['data']['totalBlocks'] + logging.info('Total parts: %d' % num_of_parts) + base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' + files_info = [] + for i in range(num_of_parts): + middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) + logging.info('middle url part %d: %s' % (i, middle_url)) + middle_info = urllib2.urlopen(middle_url).read().split('|') + middle_part_1 = middle_info[0] + download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3]) + + info = { + 'id': '%s_part%02d' % (video_id, i + 1), + 'title': title, + 'url': download_url, + 'ext': 'mp4', + } + files_info.append(info) + time.sleep(1) + + return files_info -- cgit v1.2.3 From 4ec929dc9b55a2588b4a27e64871c5bfa900bf37 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Sat, 3 Aug 2013 10:29:58 +0800 Subject: use ..utils/clean_html() --- youtube_dl/extractor/sohu.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 830814221..cf0ab5478 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -7,7 +7,7 @@ import logging import urllib2 from .common import InfoExtractor -from ..utils import compat_urllib_request +from ..utils import compat_urllib_request, clean_html class SohuIE(InfoExtractor): @@ -22,16 +22,6 @@ class SohuIE(InfoExtractor): }, } - def _clearn_html(self, string): - tags = re.findall(r'<.+?>', string) - for t in tags: - string = string.replace(t, ' ') - for i in range(2): - spaces = re.findall(r'\s+', string) - for s in spaces: - string = string.replace(s, ' ') - string = string.strip() - return string def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,7 +30,7 @@ class SohuIE(InfoExtractor): pattern = r'

\n*?(.+?)\n*?

' compiled = re.compile(pattern, re.DOTALL) title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') - title = self._clearn_html(title) + title = clean_html(title) pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -93,5 +83,8 @@ class SohuIE(InfoExtractor): } files_info.append(info) time.sleep(1) - + if num_of_parts == 1: + info = files_info[0] + info['id'] = video_id + return info return files_info -- cgit v1.2.3 From b5a6d408181c118bf51382f486a2492643ed74ec Mon Sep 17 00:00:00 2001 From: huohuarong Date: Mon, 5 Aug 2013 22:51:54 +0800 Subject: fix parse title bug --- youtube_dl/extractor/sohu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index cf0ab5478..cd049b6f0 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -27,10 +27,10 @@ class SohuIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - pattern = r'

\n*?(.+?)\n*?

' + pattern = r'(.+?)' compiled = re.compile(pattern, re.DOTALL) - title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') - title = clean_html(title) + title = self._search_regex(compiled, webpage, u'video title') + title = clean_html(title).split('-')[0].strip() pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -41,7 +41,8 @@ class SohuIE(InfoExtractor): base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' url_1 = base_url_1 + vid logging.info('json url: %s' % url_1) - json_1 = json.loads(urllib2.urlopen(url_1).read()) + webpage = self._download_webpage(url_1, vid) + json_1 = json.loads(webpage) # get the highest definition video vid and json infomation. vids = [] qualities = ('oriVid', 'superVid', 'highVid', 'norVid') -- cgit v1.2.3 From 461cead4f788f6a69902f350b9143a5e1588b57d Mon Sep 17 00:00:00 2001 From: tsantala Date: Tue, 6 Aug 2013 04:34:24 +0300 Subject: changes --- youtube_dl/extractor/AddAnime.py | 54 ++++++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/__init__.py | 2 ++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/AddAnime.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py new file mode 100644 index 000000000..43b0b24fe --- /dev/null +++ b/youtube_dl/extractor/AddAnime.py @@ -0,0 +1,54 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) +from bs4 import BeautifulSoup + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^(?:http?://)?(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'137499050692ced.flv', + u'md5': u'0813c2430bea7a46bf13acf3406992f4', + u'info_dict': { + u"description": u"One Piece 606", + u"uploader": u"mugiwaraQ8", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r'var normal_video_file = "(.*?)",', + webpage, u'video URL') + + video_title = self._og_search_title(webpage) + + video_description = self._og_search_description(webpage) + + soup = BeautifulSoup(webpage) + + video_uploader= soup.find("meta", {"author":""})['content'] + + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader + } + + return [info] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..28dcb2cc4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ + +from .AddAnime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE -- cgit v1.2.3 From d5b00ee6e0ba70fd5d87752e8772fc1c39e4bd59 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Tue, 6 Aug 2013 10:26:57 +0800 Subject: improve sohu extractor --- youtube_dl/extractor/sohu.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index cd049b6f0..24fc3a5d7 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -31,6 +31,7 @@ class SohuIE(InfoExtractor): compiled = re.compile(pattern, re.DOTALL) title = self._search_regex(compiled, webpage, u'video title') title = clean_html(title).split('-')[0].strip() + self.to_screen('Title: %s' % title) pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -70,6 +71,7 @@ class SohuIE(InfoExtractor): base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' files_info = [] for i in range(num_of_parts): + self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts)) middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) logging.info('middle url part %d: %s' % (i, middle_url)) middle_info = urllib2.urlopen(middle_url).read().split('|') -- cgit v1.2.3 From 97b3656c2e37e45d556816b8f1f15c20d14f1acd Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 9 Aug 2013 18:37:33 +0200 Subject: YoupornIE: Add support for hd videos and update Test --- youtube_dl/extractor/youporn.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..cc9c37027 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import ( unescapeHTML, unified_strdate, ) - +from ..aes import ( + aes_decrypt_text +) class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', - u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', + u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { u"upload_date": u"20101221", u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", @@ -75,6 +77,14 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' links = re.findall(LINK_RE, download_list_html) + + # Get link of hd video + encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', + webpage, u'encrypted_video_url') + video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8') + if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates + links = [video_url] + links + if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') -- cgit v1.2.3 From 5a27ecdd2ec83ba6e1069428c4c0fb3bd61f638c Mon Sep 17 00:00:00 2001 From: kkalpakloglou <kkalpakloglou@yahoo.com> Date: Fri, 16 Aug 2013 23:54:09 +0300 Subject: Update AddAnime.py --- youtube_dl/extractor/AddAnime.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py index 43b0b24fe..a312fa97e 100644 --- a/youtube_dl/extractor/AddAnime.py +++ b/youtube_dl/extractor/AddAnime.py @@ -1,11 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) -from bs4 import BeautifulSoup - class AddAnimeIE(InfoExtractor): @@ -17,7 +12,6 @@ class AddAnimeIE(InfoExtractor): u'md5': u'0813c2430bea7a46bf13acf3406992f4', u'info_dict': { u"description": u"One Piece 606", - u"uploader": u"mugiwaraQ8", u"title": u"One Piece 606" } } @@ -31,24 +25,27 @@ class AddAnimeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'var normal_video_file = "(.*?)",', - webpage, u'video URL') + + def find_between( webpage, first, last ): + try: + start = webpage.index( first ) + len( first ) + end = webpage.index( last, start ) + return webpage[start:end] + except ValueError: + return "" + + video_url = find_between( webpage, "var normal_video_file = '", "';" ) video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) - - soup = BeautifulSoup(webpage) - - video_uploader= soup.find("meta", {"author":""})['content'] info = { 'id': video_id, 'url': video_url, 'ext': 'flv', 'title': video_title, - 'description': video_description, - 'uploader': video_uploader + 'description': video_description } return [info] -- cgit v1.2.3 From 943f7f7a399c6fb3006eb2bd68070f28a272171f Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <pierre@rudloff.pro> Date: Sun, 18 Aug 2013 16:11:47 +0200 Subject: Download videos from jeuxvideo.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/jeuxvideo.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/jeuxvideo.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..b9bd3a429 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py new file mode 100644 index 000000000..d74a1c9b4 --- /dev/null +++ b/youtube_dl/extractor/jeuxvideo.py @@ -0,0 +1,33 @@ +import json +import re + +from .common import InfoExtractor + +class JeuxVideoIE(InfoExtractor): + _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage) + + xml_link = m_download.group(1) + + id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + + xml_config = self._download_webpage(xml_link, title, + 'Downloading XML config') + info = re.search(r'<format\.json>(.*?)</format\.json>', + xml_config, re.MULTILINE|re.DOTALL).group(1) + info = json.loads(info)['versions'][0] + + video_url = 'http://video720.jeuxvideo.com/' + info['file'] + + track_info = {'id':id, + 'title' : title, + 'ext' : 'mp4', + 'url' : video_url + } + + return [track_info] -- cgit v1.2.3 From 7070b83687ed134af6d9a71bbf2ec759a56965d5 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <pierre@rudloff.pro> Date: Thu, 22 Aug 2013 12:54:17 +0200 Subject: Merge remote-tracking branch 'upstream/master' --- youtube_dl/extractor/jeuxvideo.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index c8a8ae1b3..4327bc13d 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -31,7 +31,6 @@ class JeuxVideoIE(InfoExtractor): xml_config = self._download_webpage(xml_link, title, 'Downloading XML config') - config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) info = re.search(r'<format\.json>(.*?)</format\.json>', xml_config, re.MULTILINE|re.DOTALL).group(1) -- cgit v1.2.3 From cd0abcc0bb4c218fd02850a139b626d252e22599 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <contact@rudloff.pro> Date: Thu, 22 Aug 2013 13:54:23 +0200 Subject: Extractor for canalc2.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/canalc2.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/canalc2.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9d12608e1..576b8433a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -7,6 +7,7 @@ from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..d0e2ed536 --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,37 @@ +# coding: utf-8 +"""Extractor for canalc2.tv""" +import re +import lxml.html + +from .common import InfoExtractor + +class Canalc2IE(InfoExtractor): + """Extractor for canalc2.tv""" + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", + webpage).group(1) + + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + html = lxml.html.fromstring(webpage) + + title = html.cssselect('.evenement8')[0].text_content() + + return {'id': video_id, + 'ext' : 'mp4', + 'url' : video_url, + 'title' : title + } -- cgit v1.2.3 From ff2424595adf02cbe5d1f1071e53c3b2e5f32c9e Mon Sep 17 00:00:00 2001 From: Pierre Rudloff <contact@rudloff.pro> Date: Thu, 22 Aug 2013 14:47:51 +0200 Subject: lxml is not part of the standard library. --- youtube_dl/extractor/canalc2.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d0e2ed536..215abf537 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,7 +1,6 @@ # coding: utf-8 """Extractor for canalc2.tv""" import re -import lxml.html from .common import InfoExtractor @@ -25,10 +24,9 @@ class Canalc2IE(InfoExtractor): webpage).group(1) video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - - html = lxml.html.fromstring(webpage) - - title = html.cssselect('.evenement8')[0].text_content() + + title = self._html_search_regex(r'class="evenement8">(.*?)</a>', + webpage, u'title') return {'id': video_id, 'ext' : 'mp4', -- cgit v1.2.3 From 341ca8d74c8f090bd696111353400f0cef2ba9bc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 27 Aug 2013 01:59:00 +0200 Subject: [trilulilu] Add support for trilulilu.ro Fun fact: The ads (not yet supported) are loaded from youtube ;) --- youtube_dl/extractor/__init__.py | 5 +-- youtube_dl/extractor/trilulilu.py | 76 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 youtube_dl/extractor/trilulilu.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f71ae2713..fa53d9af9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -73,18 +74,18 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE -from .ustream import UstreamIE from .unistra import UnistraIE +from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..1c46156c7 --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,76 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class TriluliluIE(InfoExtractor): + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", + u'file': u"big-buck-bunny-1.mp4", + u'info_dict': { + u"title": u"Big Buck Bunny", + u"description": u":) pentru copilul din noi", + }, + # Server ignores Range headers (--test) + u"params": { + u"skip_download": True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + log_str = self._search_regex( + r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') + log = json.loads(log_str) + + format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' + u'video-formats2' % log) + format_str = self._download_webpage( + format_url, video_id, + note=u'Downloading formats', + errnote=u'Error while downloading formats') + + format_doc = xml.etree.ElementTree.fromstring(format_str) + + video_url_template = ( + u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' + u'&source=site&hash=%(hash)s&username=%(userid)s&' + u'key=ministhebest&format=%%s&sig=&exp=' % + log) + formats = [ + { + 'format': fnode.text, + 'url': video_url_template % fnode.text, + } + + for fnode in format_doc.findall('./formats/format') + ] + + info = { + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['format'].partition('-')[0] + + return info -- cgit v1.2.3 From 069d098f846ca53073ec646f335f77dac4439844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 10:21:57 +0200 Subject: [canalplus] Accept player.canalplus.fr urls --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' -- cgit v1.2.3 From 2a7b4da9b2ee11e88976e0e93796fd8460aa053d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 10:25:38 +0200 Subject: [hark] get the song info in JSON and extract more information. --- youtube_dl/extractor/hark.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..5bdd08afa 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +import json from .common import InfoExtractor from ..utils import determine_ext @@ -12,24 +13,25 @@ class HarkIE(InfoExtractor): u'file': u'mmbzyhkgny.mp3', u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', u'info_dict': { - u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", + u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + u'duration': 11, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) - webpage = self._download_webpage(embed_url, video_id) - - final_url = self._search_regex(r'src="(.+?).mp3"', - webpage, 'video url')+'.mp3' - title = self._html_search_regex(r'<title>(.+?)', - webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( - 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + json_url = "http://www.hark.com/clips/%s.json" %(video_id) + info_json = self._download_webpage(json_url, video_id) + info = json.loads(info_json) + final_url = info['url'] return {'id': video_id, 'url' : final_url, - 'title': title, + 'title': info['name'], 'ext': determine_ext(final_url), + 'description': info['description'], + 'thumbnail': info['image_original'], + 'duration': info['duration'], } -- cgit v1.2.3 From e86ea47c029c1f95a696e43df7bea2e3e617fbc3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Aug 2013 10:35:20 +0200 Subject: [canalc2] Small improvements --- youtube_dl/extractor/canalc2.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 215abf537..50832217a 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,17 +1,17 @@ # coding: utf-8 -"""Extractor for canalc2.tv""" import re from .common import InfoExtractor + class Canalc2IE(InfoExtractor): - """Extractor for canalc2.tv""" + _IE_NAME = 'canalc2.tv' _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' _TEST = { u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', u'file': u'12163.mp4', - u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'md5': u'060158428b650f896c542dfbb3d6487f', u'info_dict': { u'title': u'Terrasses du Numérique' } @@ -20,16 +20,16 @@ class Canalc2IE(InfoExtractor): def _real_extract(self, url): video_id = re.match(self._VALID_URL, url).group(1) webpage = self._download_webpage(url, video_id) - file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", - webpage).group(1) - + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - title = self._html_search_regex(r'class="evenement8">(.*?)', - webpage, u'title') + title = self._html_search_regex( + r'class="evenement8">(.*?)', webpage, u'title') return {'id': video_id, - 'ext' : 'mp4', - 'url' : video_url, - 'title' : title + 'ext': 'mp4', + 'url': video_url, + 'title': title, } -- cgit v1.2.3 From 1a582dd49d628914fa6a056b490914738f15c56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 27 Aug 2013 11:56:48 +0200 Subject: Add an extractor for CNN (closes #1318) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cnn.py | 47 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/cnn.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eeeb3db50..ea2af0d0e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..cee78765b --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,47 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + +class CNNIE(InfoExtractor): + _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P.+?/(?P[^/]+?)\.cnn)' + + _TEST = { + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_xml = self._download_webpage( + 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } -- cgit v1.2.3 From 0bc56fa66a4b0f1b6bf827bd3550a119d3e3b231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 27 Aug 2013 12:38:30 +0200 Subject: Add an extractor for NBC news (closes #1320) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nbc.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/nbc.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ea2af0d0e..27bbcc0f7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,7 @@ from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .nbc import NBCNewsIE from .ooyala import OoyalaIE from .pbs import PBSIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } -- cgit v1.2.3 From 7f3c4f4f65ddb4f8374b31b74428780e60a373de Mon Sep 17 00:00:00 2001 From: Jeff Smith <whydoubt@yahoo.com> Date: Tue, 27 Aug 2013 14:38:50 -0500 Subject: Initial slash in Google+ photos link was removed --- youtube_dl/extractor/googleplus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com' - video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), + DOMAIN = 'https://plus.google.com/' + video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page -- cgit v1.2.3 From 273f603efb2028a54e04cca314b72bc2a9d767ef Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 00:14:19 +0200 Subject: [cnn] Allow more URLs --- youtube_dl/extractor/cnn.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index cee78765b..4338bd180 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -4,10 +4,12 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext + class CNNIE(InfoExtractor): - _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)' + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' - _TEST = { + _TESTS = [{ u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', u'md5': u'3e6121ea48df7e2259fe73a0628605c4', @@ -15,14 +17,24 @@ class CNNIE(InfoExtractor): u'title': u'Nadal wins 8th French Open title', u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', }, - } + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) path = mobj.group('path') page_title = mobj.group('title') - info_xml = self._download_webpage( - 'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title) + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + print(info_url) + info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) formats = [] -- cgit v1.2.3 From 44586389e4676dfd926255cf76e36684dcf4742d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 02:18:44 +0200 Subject: [appletrailers] Add support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/appletrailers.py | 167 ++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 youtube_dl/extractor/appletrailers.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 27bbcc0f7..2f86f2aca 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,4 @@ +from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..7d126e2d2 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,167 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) + playlist_html = u'<html>' + playlist_cleaned + u'</html>' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//li/a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } -- cgit v1.2.3 From a1bb0f8773e0fff787ffe7bd1729073f3385d2ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:20:37 +0200 Subject: [cnn] remove debug print call. --- youtube_dl/extractor/cnn.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 4338bd180..a79f881cd 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -33,7 +33,6 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path - print(info_url) info_xml = self._download_webpage(info_url, page_title) info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) -- cgit v1.2.3 From 3e223834d9f358bc7cb1c3748dc63d1ab40d9b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 10:26:44 +0200 Subject: [youtube] update algo for length 88, thanks to @Ramhack (fixes #1328) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index af01c9da0..8e486afd0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: -- cgit v1.2.3 From 4f5f18acb93ea2bf70f80c7f76e6bb6b8dee3fbf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 10:28:16 +0200 Subject: [addanime] add file --- youtube_dl/extractor/addanime.py | 76 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 youtube_dl/extractor/addanime.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..46db8262f --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,76 @@ +import ast +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'<form id="challenge-form" action="([^"]+)"', + redir_webpage, u'Redirect form') + vc = self._search_regex( + r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } -- cgit v1.2.3 From ae3531adf926998d42d1fb52453491c85e33b5f0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:04:44 +0200 Subject: [generic] Fix URL concatenation When the url is something like http://example.org/foo/bar?x=y and the added is file/video.mp4 , we want http://example.org/foo/file/video.mp4 Fixes #1268. --- youtube_dl/extractor/generic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d034a11bb..bfc9bff49 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -166,7 +166,12 @@ class GenericIE(InfoExtractor): if video_url.startswith('//'): video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url if '://' not in video_url: - video_url = url + ('' if url.endswith('/') else '/') + video_url + up = compat_urllib_parse_urlparse(url) + if video_url.startswith('/'): + video_url = up.scheme + '://' + up.netloc + video_url + else: # relative path + video_url = (up.scheme + '://' + up.netloc + + up.path.rpartition('/')[0] + '/' + video_url) video_id = os.path.basename(video_url) # here's a fun little line of code for you: -- cgit v1.2.3 From a5caba1eb02665cdc982d6be4a933aafd79243de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:47:27 +0200 Subject: [generic] simply use urljoin --- youtube_dl/extractor/generic.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bfc9bff49..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_error, compat_urllib_parse, - compat_urllib_parse_urlparse, compat_urllib_request, + compat_urlparse, ExtractorError, ) @@ -163,15 +163,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_url = compat_urllib_parse.unquote(mobj.group(1)) - if video_url.startswith('//'): - video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url - if '://' not in video_url: - up = compat_urllib_parse_urlparse(url) - if video_url.startswith('/'): - video_url = up.scheme + '://' + up.netloc + video_url - else: # relative path - video_url = (up.scheme + '://' + up.netloc + - up.path.rpartition('/')[0] + '/' + video_url) + video_url = compat_urlparse.urljoin(url, video_url) video_id = os.path.basename(video_url) # here's a fun little line of code for you: -- cgit v1.2.3 From ce6a696e4d964aeb27de46a31a899b28d7ca7754 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 12:47:38 +0200 Subject: Remove unused imports --- youtube_dl/extractor/addanime.py | 1 - youtube_dl/extractor/appletrailers.py | 1 - youtube_dl/extractor/trilulilu.py | 3 --- youtube_dl/extractor/wat.py | 1 - 4 files changed, 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 46db8262f..82a785a19 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -1,4 +1,3 @@ -import ast import re from .common import InfoExtractor diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index b3bdb2955..8b191c196 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -4,7 +4,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( determine_ext, - ExtractorError, ) diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 1c46156c7..f278951ba 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -3,9 +3,6 @@ import re import xml.etree.ElementTree from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TriluliluIE(InfoExtractor): diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, unified_strdate, ) -- cgit v1.2.3 From 67b22dd03686d9e360d87a7751de74b321d3f231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Aug 2013 12:51:22 +0200 Subject: Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mit.py | 76 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/mit.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c76b99a81..21e9e5d37 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -50,6 +50,7 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + embed_page = self._download_webpage( + 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, + note=u'Downloading embed page') + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + embed_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', webpage) + description = clean_html(get_element_by_id('edit-description', webpage)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + embed_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') -- cgit v1.2.3 From f143d86ad2fc0633d8e2da598cf21e73ff0f2872 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 28 Aug 2013 13:59:08 +0200 Subject: [sohu] Handle encoding, and fix tests --- youtube_dl/extractor/common.py | 9 ++- youtube_dl/extractor/sohu.py | 131 ++++++++++++++++++++--------------------- 2 files changed, 71 insertions(+), 69 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77a13aea5..a2986cebe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -145,12 +145,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 24fc3a5d7..77bb0a8dc 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -1,13 +1,10 @@ # encoding: utf-8 -import re import json -import time -import logging -import urllib2 +import re from .common import InfoExtractor -from ..utils import compat_urllib_request, clean_html +from ..utils import ExtractorError class SohuIE(InfoExtractor): @@ -15,79 +12,79 @@ class SohuIE(InfoExtractor): _TEST = { u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', - u'file': u'382479172.flv', - u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b', + u'file': u'382479172.mp4', + u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', u'info_dict': { - u'title': u'The Illest - Far East Movement Riff Raff', + u'title': u'MV:Far East Movement《The Illest》', }, } - def _real_extract(self, url): + + def _fetch_data(vid_id): + base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + data_url = base_data_url + str(vid_id) + data_json = self._download_webpage( + data_url, video_id, + note=u'Downloading JSON data for ' + str(vid_id)) + return json.loads(data_json) + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) - pattern = r'<title>(.+?)' - compiled = re.compile(pattern, re.DOTALL) - title = self._search_regex(compiled, webpage, u'video title') - title = clean_html(title).split('-')[0].strip() - self.to_screen('Title: %s' % title) - pattern = re.compile(r'var vid="(\d+)"') - result = re.search(pattern, webpage) - if not result: - logging.info('[Sohu] could not get vid') - return None - vid = result.group(1) - logging.info('vid: %s' % vid) - base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - url_1 = base_url_1 + vid - logging.info('json url: %s' % url_1) - webpage = self._download_webpage(url_1, vid) - json_1 = json.loads(webpage) - # get the highest definition video vid and json infomation. - vids = [] - qualities = ('oriVid', 'superVid', 'highVid', 'norVid') - for vid_name in qualities: - vids.append(json_1['data'][vid_name]) - clearest_vid = 0 - for i, v in enumerate(vids): - if v != 0: - clearest_vid = v - logging.info('quality definition: %s' % qualities[i][:-3]) - break - if not clearest_vid: - logging.warning('could not find valid clearest_vid') - return None - if vid != clearest_vid: - url_1 = '%s%d' % (base_url_1, clearest_vid) - logging.info('highest definition json url: %s' % url_1) - json_1 = json.loads(urllib2.urlopen(url_1).read()) - allot = json_1['allot'] - prot = json_1['prot'] - clipsURL = json_1['data']['clipsURL'] - su = json_1['data']['su'] - num_of_parts = json_1['data']['totalBlocks'] - logging.info('Total parts: %d' % num_of_parts) - base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]' - files_info = [] - for i in range(num_of_parts): - self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts)) - middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i]) - logging.info('middle url part %d: %s' % (i, middle_url)) - middle_info = urllib2.urlopen(middle_url).read().split('|') - middle_part_1 = middle_info[0] - download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3]) + raw_title = self._html_search_regex(r'(?s)(.+?)', + webpage, u'video title') + title = raw_title.partition('-')[0].strip() - info = { + vid = self._html_search_regex(r'var vid="(\d+)"', webpage, + u'video path') + data = _fetch_data(vid) + + QUALITIES = ('ori', 'super', 'high', 'nor') + vid_ids = [data['data'][q + 'Vid'] + for q in QUALITIES + if data['data'][q + 'Vid'] != 0] + if not vid_ids: + raise ExtractorError(u'No formats available for this video') + + # For now, we just pick the highest available quality + vid_id = vid_ids[-1] + + format_data = data if vid == vid_id else _fetch_data(vid_id) + part_count = format_data['data']['totalBlocks'] + allot = format_data['allot'] + prot = format_data['prot'] + clipsURL = format_data['data']['clipsURL'] + su = format_data['data']['su'] + + playlist = [] + for i in range(part_count): + part_url = ('http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clipsURL[i], su[i])) + part_str = self._download_webpage( + part_url, video_id, + note=u'Downloading part %d of %d' % (i+1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_info = { 'id': '%s_part%02d' % (video_id, i + 1), 'title': title, - 'url': download_url, + 'url': video_url, 'ext': 'mp4', } - files_info.append(info) - time.sleep(1) - if num_of_parts == 1: - info = files_info[0] + playlist.append(video_info) + + if len(playlist) == 1: + info = playlist[0] info['id'] = video_id - return info - return files_info + else: + info = { + '_type': 'playlist', + 'entries': playlist, + 'id': video_id, + } + + return info -- cgit v1.2.3 From 48ea9cea77e7ea24ee867027f03ca37dd1b935d8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Aug 2013 14:28:55 +0200 Subject: Allow changes to run under Python 3 --- youtube_dl/extractor/youporn.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index cc9c37027..19360e273 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,6 +5,7 @@ import sys from .common import InfoExtractor from ..utils import ( + compat_str, compat_urllib_parse_urlparse, compat_urllib_request, @@ -79,13 +80,16 @@ class YouPornIE(InfoExtractor): links = re.findall(LINK_RE, download_list_html) # Get link of hd video - encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', + encrypted_video_url = self._html_search_regex( + r'var encrypted(?:Quality[0-9]+)?URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', webpage, u'encrypted_video_url') - video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32) + print(video_url) + assert isinstance(video_url, compat_str) if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates links = [video_url] + links - if(len(links) == 0): + if not links: raise ExtractorError(u'ERROR: no known formats available for video') self.to_screen(u'Links found: %d' % len(links)) @@ -122,7 +126,7 @@ class YouPornIE(InfoExtractor): self._print_formats(formats) return - req_format = self._downloader.params.get('format', None) + req_format = self._downloader.params.get('format', 'best') self.to_screen(u'Format: %s' % req_format) if req_format is None or req_format == 'best': -- cgit v1.2.3 From 878e83c5a4c84c7abbf3484366e76fbe906c8947 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Wed, 28 Aug 2013 16:04:48 +0200 Subject: YoupornIE: Clean up extraction of hd video --- youtube_dl/extractor/youporn.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 19360e273..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,7 +5,6 @@ import sys from .common import InfoExtractor from ..utils import ( - compat_str, compat_urllib_parse_urlparse, compat_urllib_request, @@ -79,14 +78,11 @@ class YouPornIE(InfoExtractor): LINK_RE = r'(?s)' links = re.findall(LINK_RE, download_list_html) - # Get link of hd video - encrypted_video_url = self._html_search_regex( - r'var encrypted(?:Quality[0-9]+)?URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', - webpage, u'encrypted_video_url') - video_url = aes_decrypt_text(encrypted_video_url, video_title, 32) - print(video_url) - assert isinstance(video_url, compat_str) - if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates + # Get link of hd video if available + mobj = re.search(r'var encryptedQuality720URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', webpage) + if mobj != None: + encrypted_video_url = mobj.group(u'encrypted_video_url') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') links = [video_url] + links if not links: -- cgit v1.2.3