diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/addanime.py | 75 | ||||
-rw-r--r-- | youtube_dl/extractor/appletrailers.py | 166 | ||||
-rw-r--r-- | youtube_dl/extractor/canalc2.py | 35 | ||||
-rw-r--r-- | youtube_dl/extractor/canalplus.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/cnn.py | 58 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 16 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/googleplus.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/hark.py | 22 | ||||
-rw-r--r-- | youtube_dl/extractor/kankan.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/mit.py | 76 | ||||
-rw-r--r-- | youtube_dl/extractor/nbc.py | 33 | ||||
-rw-r--r-- | youtube_dl/extractor/sohu.py | 90 | ||||
-rw-r--r-- | youtube_dl/extractor/trilulilu.py | 73 | ||||
-rw-r--r-- | youtube_dl/extractor/wat.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/youporn.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 2 |
18 files changed, 665 insertions, 31 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f71ae2713..6b5037c8c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ +from .appletrailers import AppleTrailersIE +from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE @@ -6,7 +8,10 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE @@ -45,12 +50,14 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .nbc import NBCNewsIE from .ooyala import OoyalaIE from .pbs import PBSIE from .photobucket import PhotobucketIE @@ -63,6 +70,7 @@ from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE from .slashdot import SlashdotIE +from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -73,18 +81,18 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE -from .ustream import UstreamIE from .unistra import UnistraIE +from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..82a785a19 --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,75 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'<form id="challenge-form" action="([^"]+)"', + redir_webpage, u'Redirect form') + vc = self._search_regex( + r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..8b191c196 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,166 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) + playlist_html = u'<html>' + playlist_cleaned + u'</html>' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..50832217a --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + + +class Canalc2IE(InfoExtractor): + _IE_NAME = 'canalc2.tv' + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'060158428b650f896c542dfbb3d6487f', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + title = self._html_search_regex( + r'class="evenement8">(.*?)</a>', webpage, u'title') + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..a79f881cd --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,58 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class CNNIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' + + _TESTS = [{ + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + info_xml = self._download_webpage(info_url, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52c4483c9..a2986cebe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -114,6 +114,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] @@ -129,7 +134,7 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err) def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ @@ -140,12 +145,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d034a11bb..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_error, compat_urllib_parse, - compat_urllib_parse_urlparse, compat_urllib_request, + compat_urlparse, ExtractorError, ) @@ -163,10 +163,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_url = compat_urllib_parse.unquote(mobj.group(1)) - if video_url.startswith('//'): - video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url - if '://' not in video_url: - video_url = url + ('' if url.endswith('/') else '/') + video_url + video_url = compat_urlparse.urljoin(url, video_url) video_id = os.path.basename(video_url) # here's a fun little line of code for you: diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com' - video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), + DOMAIN = 'https://plus.google.com/' + video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..5bdd08afa 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +import json from .common import InfoExtractor from ..utils import determine_ext @@ -12,24 +13,25 @@ class HarkIE(InfoExtractor): u'file': u'mmbzyhkgny.mp3', u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', u'info_dict': { - u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", + u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + u'duration': 11, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) - webpage = self._download_webpage(embed_url, video_id) - - final_url = self._search_regex(r'src="(.+?).mp3"', - webpage, 'video url')+'.mp3' - title = self._html_search_regex(r'<title>(.+?)</title>', - webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( - 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + json_url = "http://www.hark.com/clips/%s.json" %(video_id) + info_json = self._download_webpage(json_url, video_id) + info = json.loads(info_json) + final_url = info['url'] return {'id': video_id, 'url' : final_url, - 'title': title, + 'title': info['name'], 'ext': determine_ext(final_url), + 'description': info['description'], + 'thumbnail': info['image_original'], + 'duration': info['duration'], } diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') - gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') + surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) + gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) + gcid = gcids[-1] video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, video_id, u'Downloading video url info') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + embed_page = self._download_webpage( + 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, + note=u'Downloading embed page') + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + embed_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', webpage) + description = clean_html(get_element_by_id('edit-description', webpage)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + embed_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..77bb0a8dc --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +import json +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' + + _TEST = { + u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', + u'file': u'382479172.mp4', + u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', + u'info_dict': { + u'title': u'MV:Far East Movement《The Illest》', + }, + } + + def _real_extract(self, url): + + def _fetch_data(vid_id): + base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + data_url = base_data_url + str(vid_id) + data_json = self._download_webpage( + data_url, video_id, + note=u'Downloading JSON data for ' + str(vid_id)) + return json.loads(data_json) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>', + webpage, u'video title') + title = raw_title.partition('-')[0].strip() + + vid = self._html_search_regex(r'var vid="(\d+)"', webpage, + u'video path') + data = _fetch_data(vid) + + QUALITIES = ('ori', 'super', 'high', 'nor') + vid_ids = [data['data'][q + 'Vid'] + for q in QUALITIES + if data['data'][q + 'Vid'] != 0] + if not vid_ids: + raise ExtractorError(u'No formats available for this video') + + # For now, we just pick the highest available quality + vid_id = vid_ids[-1] + + format_data = data if vid == vid_id else _fetch_data(vid_id) + part_count = format_data['data']['totalBlocks'] + allot = format_data['allot'] + prot = format_data['prot'] + clipsURL = format_data['data']['clipsURL'] + su = format_data['data']['su'] + + playlist = [] + for i in range(part_count): + part_url = ('http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clipsURL[i], su[i])) + part_str = self._download_webpage( + part_url, video_id, + note=u'Downloading part %d of %d' % (i+1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_info = { + 'id': '%s_part%02d' % (video_id, i + 1), + 'title': title, + 'url': video_url, + 'ext': 'mp4', + } + playlist.append(video_info) + + if len(playlist) == 1: + info = playlist[0] + info['id'] = video_id + else: + info = { + '_type': 'playlist', + 'entries': playlist, + 'id': video_id, + } + + return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..f278951ba --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,73 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class TriluliluIE(InfoExtractor): + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", + u'file': u"big-buck-bunny-1.mp4", + u'info_dict': { + u"title": u"Big Buck Bunny", + u"description": u":) pentru copilul din noi", + }, + # Server ignores Range headers (--test) + u"params": { + u"skip_download": True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + log_str = self._search_regex( + r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') + log = json.loads(log_str) + + format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' + u'video-formats2' % log) + format_str = self._download_webpage( + format_url, video_id, + note=u'Downloading formats', + errnote=u'Error while downloading formats') + + format_doc = xml.etree.ElementTree.fromstring(format_str) + + video_url_template = ( + u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' + u'&source=site&hash=%(hash)s&username=%(userid)s&' + u'key=ministhebest&format=%%s&sig=&exp=' % + log) + formats = [ + { + 'format': fnode.text, + 'url': video_url_template % fnode.text, + } + + for fnode in format_doc.findall('./formats/format') + ] + + info = { + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['format'].partition('-')[0] + + return info diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, unified_strdate, ) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import ( unescapeHTML, unified_strdate, ) - +from ..aes import ( + aes_decrypt_text +) class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', - u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', + u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { u"upload_date": u"20101221", u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", @@ -75,7 +77,15 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' links = re.findall(LINK_RE, download_list_html) - if(len(links) == 0): + + # Get link of hd video if available + mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage) + if mobj != None: + encrypted_video_url = mobj.group(u'encrypted_video_url') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') + links = [video_url] + links + + if not links: raise ExtractorError(u'ERROR: no known formats available for video') self.to_screen(u'Links found: %d' % len(links)) @@ -112,7 +122,7 @@ class YouPornIE(InfoExtractor): self._print_formats(formats) return - req_format = self._downloader.params.get('format', None) + req_format = self._downloader.params.get('format', 'best') self.to_screen(u'Format: %s' % req_format) if req_format is None or req_format == 'best': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index af01c9da0..8e486afd0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: |