diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/aparat.py | 56 | ||||
-rw-r--r-- | youtube_dl/extractor/blinkx.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/bliptv.py | 87 | ||||
-rw-r--r-- | youtube_dl/extractor/brightcove.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 171 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 48 | ||||
-rw-r--r-- | youtube_dl/extractor/imdb.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/ivi.py | 154 | ||||
-rw-r--r-- | youtube_dl/extractor/mdr.py | 19 | ||||
-rw-r--r-- | youtube_dl/extractor/ooyala.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/smotri.py | 59 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/vbox7.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 11 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 22 |
17 files changed, 552 insertions, 110 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7f2f8806e..a39a1e2f4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,7 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .anitube import AnitubeIE +from .aparat import AparatIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE @@ -32,6 +33,7 @@ from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE +from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE from .d8 import D8IE from .dailymotion import ( @@ -82,6 +84,10 @@ from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE from .internetvideoarchive import InternetVideoArchiveIE +from .ivi import ( + IviIE, + IviCompilationIE +) from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py new file mode 100644 index 000000000..7e93bc4df --- /dev/null +++ b/youtube_dl/extractor/aparat.py @@ -0,0 +1,56 @@ +#coding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + HEADRequest, +) + + +class AparatIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + + _TEST = { + u'url': u'http://www.aparat.com/v/wP8On', + u'file': u'wP8On.mp4', + u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', + u'info_dict': { + u"title": u"تیم گلکسی 11 - زومیت", + }, + #u'skip': u'Extremely unreliable', + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + + video_id + u'/vt/frame') + webpage = self._download_webpage(embed_url, video_id) + + video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) + for i, video_url in enumerate(video_urls): + req = HEADRequest(video_url) + res = self._request_webpage( + req, video_id, note=u'Testing video URL %d' % i, errnote=False) + if res: + break + else: + raise ExtractorError(u'No working video URLs found') + + title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title') + thumbnail = self._search_regex( + r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 48f16b692..144ce64cc 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -9,7 +9,7 @@ from ..utils import ( class BlinkxIE(InfoExtractor): - _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/ce/|blinkx:)(?P<id>[^?]+)' + _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' _IE_NAME = u'blinkx' _TEST = { @@ -54,6 +54,10 @@ class BlinkxIE(InfoExtractor): }) elif m['type'] == 'original': duration = m['d'] + elif m['type'] == 'youtube': + yt_id = m['link'] + self.to_screen(u'Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 5e33a69df..0e63208df 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -70,13 +70,14 @@ class BlipTVIE(InfoExtractor): info = None urlh = self._request_webpage(request, None, False, u'unable to download video info webpage') + if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download basename = url.split('/')[-1] title,ext = os.path.splitext(basename) title = title.decode('UTF-8') ext = ext.replace('.', '') self.report_direct_download(title) - info = { + return { 'id': title, 'url': url, 'uploader': None, @@ -85,49 +86,47 @@ class BlipTVIE(InfoExtractor): 'ext': ext, 'urlhandle': urlh } - if info is None: # Regular URL - try: - json_code_bytes = urlh.read() - json_code = json_code_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) - - try: - json_data = json.loads(json_code) - if 'Post' in json_data: - data = json_data['Post'] - else: - data = json_data - - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - if 'additionalMedia' in data: - formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) - best_format = formats[-1] - video_url = best_format['url'] - else: - video_url = data['media']['url'] - umobj = re.match(self._URL_EXT, video_url) - if umobj is None: - raise ValueError('Can not determine filename extension') - ext = umobj.group(1) - - info = { - 'id': compat_str(data['item_id']), - 'url': video_url, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'ext': ext, - 'format': data['media']['mimeType'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'player_url': data['embedUrl'], - 'user_agent': 'iTunes/10.6.1', - } - except (ValueError,KeyError) as err: - raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) - - return [info] + + try: + json_code_bytes = urlh.read() + json_code = json_code_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) + + try: + json_data = json.loads(json_code) + if 'Post' in json_data: + data = json_data['Post'] + else: + data = json_data + + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') + if 'additionalMedia' in data: + formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) + best_format = formats[-1] + video_url = best_format['url'] + else: + video_url = data['media']['url'] + umobj = re.match(self._URL_EXT, video_url) + if umobj is None: + raise ValueError('Can not determine filename extension') + ext = umobj.group(1) + + return { + 'id': compat_str(data['item_id']), + 'url': video_url, + 'uploader': data['display_name'], + 'upload_date': upload_date, + 'title': data['title'], + 'ext': ext, + 'format': data['media']['mimeType'], + 'thumbnail': data['thumbnailUrl'], + 'description': data['description'], + 'player_url': data['embedUrl'], + 'user_agent': 'iTunes/10.6.1', + } + except (ValueError, KeyError) as err: + raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) class BlipTVUserIE(InfoExtractor): diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b1b7526ca..f7f0041c0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,7 +26,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'8eccab865181d29ec2958f32a6a754f5', + u'md5': u'5423e113865d26e40624dce2e4b45d95', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 939249d7b..ba46a7bc7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -170,6 +170,8 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is False: + return False if errnote is None: errnote = u'Unable to download webpage' errmsg = u'%s: %s' % (errnote, compat_str(err)) @@ -263,7 +265,8 @@ class InfoExtractor(object): self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None, video_id=None): + @staticmethod + def url_result(url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', @@ -272,7 +275,8 @@ class InfoExtractor(object): if video_id is not None: video_info['id'] = video_id return video_info - def playlist_result(self, entries, playlist_id=None, playlist_title=None): + @staticmethod + def playlist_result(entries, playlist_id=None, playlist_title=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py new file mode 100644 index 000000000..2b66bddbb --- /dev/null +++ b/youtube_dl/extractor/crunchyroll.py @@ -0,0 +1,171 @@ +# encoding: utf-8 +import re, base64, zlib +from hashlib import sha1 +from math import pow, sqrt, floor +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + bytes_to_intlist, + intlist_to_bytes, + unified_strdate, + clean_html, +) +from ..aes import ( + aes_cbc_decrypt, + inc, +) + +class CrunchyrollIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?(?P<url>crunchyroll\.com/[^/]*/[^/?&]*?(?P<video_id>[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + u'file': u'645513.flv', + #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', + u'info_dict': { + u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', + u'description': u'md5:2d17137920c64f2f49981a7797d275ef', + u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + u'uploader': u'Yomiuri Telecasting Corporation (YTV)', + u'upload_date': u'20131013', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + }] + + _FORMAT_IDS = { + u'360': (u'60', u'106'), + u'480': (u'61', u'106'), + u'720': (u'62', u'106'), + u'1080': (u'80', u'108'), + } + + def _decrypt_subtitles(self, data, iv, id): + data = bytes_to_intlist(data) + iv = bytes_to_intlist(iv) + id = int(id) + + def obfuscate_key_aux(count, modulo, start): + output = list(start) + for _ in range(count): + output.append(output[-1] + output[-2]) + # cut off start values + output = output[2:] + output = list(map(lambda x: x % modulo + 33, output)) + return output + + def obfuscate_key(key): + num1 = int(floor(pow(2, 25) * sqrt(6.9))) + num2 = (num1 ^ key) << 5 + num3 = key ^ num1 + num4 = num3 ^ (num3 >> 3) ^ num2 + prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) + # Extend 160 Bit hash to 256 Bit + return shaHash + [0] * 12 + + key = obfuscate_key(id) + class Counter: + __value = iv + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) + return zlib.decompress(decrypted_data) + + def _convert_subtitles_to_srt(self, subtitles): + i=1 + output = u'' + for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + start = start.replace(u'.', u',') + end = end.replace(u'.', u',') + text = clean_html(text) + text = text.replace(u'\\N', u'\n') + if not text: + continue + output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + i+=1 + return output + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://www.' + mobj.group('url') + video_id = mobj.group(u'video_id') + webpage = self._download_webpage(webpage_url, video_id) + note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, u'trailer-notice', default=u'') + if note_m: + raise ExtractorError(note_m) + + video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL) + video_title = re.sub(r' {2,}', u' ', video_title) + video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') + if not video_description: + video_description = None + video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) + if video_upload_date: + video_upload_date = unified_strdate(video_upload_date) + video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) + + playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) + playerdata_req = compat_urllib_request.Request(playerdata_url) + playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) + playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') + + stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, u'stream_id') + video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, u'thumbnail', fatal=False) + + formats = [] + for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt+u'p' + streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') + # urlencode doesn't work! + streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format + streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) + streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) + video_url = self._search_regex(r'<host>([^<]+)', streamdata, u'video_url') + video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, u'video_play_path') + formats.append({ + u'url': video_url, + u'play_path': video_play_path, + u'ext': 'flv', + u'format': video_format, + u'format_id': video_format, + }) + + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ + video_id, note=u'Downloading subtitles for '+sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) + iv = self._search_regex(r'<iv>([^<]+)', sub_page, u'subtitle_iv', fatal=False) + data = self._search_regex(r'<data>([^<]+)', sub_page, u'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') + lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) + if not lang_code: + continue + subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + + return { + u'id': video_id, + u'title': video_title, + u'description': video_description, + u'thumbnail': video_thumbnail, + u'uploader': video_uploader, + u'upload_date': video_upload_date, + u'subtitles': subtitles, + u'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fd32370c2..7a14c98f9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,12 +11,14 @@ from ..utils import ( compat_urlparse, ExtractorError, + HEADRequest, smuggle_url, unescapeHTML, unified_strdate, url_basename, ) from .brightcove import BrightcoveIE +from .ooyala import OoyalaIE class GenericIE(InfoExtractor): @@ -83,7 +85,17 @@ class GenericIE(InfoExtractor): u'title': u'trailer', u'upload_date': u'20100513', } - } + }, + # ooyala video + { + u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c', + u'info_dict': { + u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', + u'ext': u'mp4', + u'title': u'2cc213299525360.mov', #that's what we get + }, + }, ] def report_download_webpage(self, video_id): @@ -98,21 +110,18 @@ class GenericIE(InfoExtractor): def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" - class HeadRequest(compat_urllib_request.Request): - def get_method(self): - return "HEAD" class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ Subclass the HTTPRedirectHandler to make it use our - HeadRequest also on the redirected URL + HEADRequest also on the redirected URL """ def redirect_request(self, req, fp, code, msg, headers, newurl): if code in (301, 302, 303, 307): newurl = newurl.replace(' ', '%20') newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, + return HEADRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(), unverifiable=True) @@ -141,7 +150,7 @@ class GenericIE(InfoExtractor): compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) - response = opener.open(HeadRequest(url)) + response = opener.open(HEADRequest(url)) if response is None: raise ExtractorError(u'Invalid URL protocol') return response @@ -213,7 +222,7 @@ class GenericIE(InfoExtractor): self.to_screen(u'Brightcove video detected.') return self.url_result(bc_url, 'Brightcove') - # Look for embedded Vimeo player + # Look for embedded (iframe) Vimeo player mobj = re.search( r'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage) if mobj: @@ -221,9 +230,18 @@ class GenericIE(InfoExtractor): surl = smuggle_url(player_url, {'Referer': url}) return self.url_result(surl, 'Vimeo') + # Look for embedded (swf embed) Vimeo player + mobj = re.search( + r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) + if mobj: + return self.url_result(mobj.group(1), 'Vimeo') + # Look for embedded YouTube player - matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) + matches = re.findall(r'''(?x) + (?:<iframe[^>]+?src=|embedSWF\(\s*) + (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/ + (?:embed|v)/.+?) + \1''', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] @@ -277,6 +295,16 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for Ooyala videos + mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage) + if mobj is not None: + return OoyalaIE._build_url_result(mobj.group(1)) + + # Look for Aparat videos + mobj = re.search(r'<iframe src="(http://www.aparat.com/video/[^"]+)"', webpage) + if mobj is not None: + return self.url_result(mobj.group(1), 'Aparat') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 6fb373db2..e5332cce8 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -11,7 +11,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = u'imdb' IE_DESC = u'Internet Movie Database trailers' - _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { u'url': u'http://www.imdb.com/video/imdb/vi2524815897', @@ -27,7 +27,7 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url,video_id) + webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) descr = get_element_by_attribute('itemprop', 'description', webpage) available_formats = re.findall( r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py new file mode 100644 index 000000000..4bdf55f93 --- /dev/null +++ b/youtube_dl/extractor/ivi.py @@ -0,0 +1,154 @@ +# encoding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + ExtractorError, +) + + +class IviIE(InfoExtractor): + IE_DESC = u'ivi.ru' + IE_NAME = u'ivi' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' + + _TESTS = [ + # Single movie + { + u'url': u'http://www.ivi.ru/watch/53141', + u'file': u'53141.mp4', + u'md5': u'6ff5be2254e796ed346251d117196cf4', + u'info_dict': { + u'title': u'Иван Васильевич меняет профессию', + u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346', + u'duration': 5498, + u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', + }, + u'skip': u'Only works from Russia', + }, + # Serial's serie + { + u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791', + u'file': u'74791.mp4', + u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9', + u'info_dict': { + u'title': u'Дежурный ангел - 1 серия', + u'duration': 2490, + u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', + }, + u'skip': u'Only works from Russia', + } + ] + + # Sorted by quality + _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + + # Sorted by size + _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480'] + + def _extract_description(self, html): + m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html) + return m.group('description') if m is not None else None + + def _extract_comment_count(self, html): + m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) + return int(m.group('commentcount')) if m is not None else 0 + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + + api_url = 'http://api.digitalaccess.ru/api/json/' + + data = {u'method': u'da.content.get', + u'params': [video_id, {u'site': u's183', + u'referrer': u'http://www.ivi.ru/watch/%s' % video_id, + u'contentid': video_id + } + ] + } + + request = compat_urllib_request.Request(api_url, json.dumps(data)) + + video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON') + video_json = json.loads(video_json_page) + + if u'error' in video_json: + error = video_json[u'error'] + if error[u'origin'] == u'NoRedisValidData': + raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True) + + result = video_json[u'result'] + + formats = [{'url': x[u'url'], + 'format_id': x[u'content_format'] + } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + + if len(formats) == 0: + self._downloader.report_warning(u'No media links available for %s' % video_id) + return + + duration = result[u'duration'] + compilation = result[u'compilation'] + title = result[u'title'] + + title = '%s - %s' % (compilation, title) if compilation is not None else title + + previews = result[u'preview'] + previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) + thumbnail = previews[-1][u'url'] if len(previews) > 0 else None + + video_page = self._download_webpage(url, video_id, u'Downloading video page') + description = self._extract_description(video_page) + comment_count = self._extract_comment_count(video_page) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + 'comment_count': comment_count, + 'formats': formats, + } + + +class IviCompilationIE(InfoExtractor): + IE_DESC = u'ivi.ru compilations' + IE_NAME = u'ivi:compilation' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' + + def _extract_entries(self, html, compilation_id): + return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi') + for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + compilation_id = mobj.group('compilationid') + season_id = mobj.group('seasonid') + + if season_id is not None: # Season link + season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id) + playlist_id = '%s/season%s' % (compilation_id, season_id) + playlist_title = self._html_search_meta(u'title', season_page, u'title') + entries = self._extract_entries(season_page, compilation_id) + else: # Compilation link + compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page') + playlist_id = compilation_id + playlist_title = self._html_search_meta(u'title', compilation_page, u'title') + seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page) + if len(seasons) == 0: # No seasons in this compilation + entries = self._extract_entries(compilation_page, compilation_id) + else: + entries = [] + for season_id in seasons: + season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), + compilation_id, u'Downloading season %s web page' % season_id) + entries.extend(self._extract_entries(season_page, compilation_id)) + + return self.playlist_result(entries, playlist_id, playlist_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index d29cf2c07..08ce0647f 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -8,23 +8,8 @@ from ..utils import ( class MDRIE(InfoExtractor): _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*' - - _TESTS = [{ - u'url': u'http://www.mdr.de/mediathek/themen/nachrichten/video165624_zc-c5c7de76_zs-3795826d.html', - u'file': u'165624.mp4', - u'md5': u'ae785f36ecbf2f19b42edf1bc9c85815', - u'info_dict': { - u"title": u"MDR aktuell Eins30 09.12.2013, 22:48 Uhr" - }, - }, - { - u'url': u'http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html', - u'file': u'718370.mp3', - u'md5': u'a9d21345a234c7b45dee612f290fd8d7', - u'info_dict': { - u"title": u"MDR 1 RADIO SACHSEN 10.12.2013, 05:00 Uhr" - }, - }] + + # No tests, MDR regularily deletes its videos def _real_extract(self, url): m = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 1f7b4d2e7..d08e47734 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -22,6 +22,11 @@ class OoyalaIE(InfoExtractor): def _url_for_embed_code(embed_code): return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + @classmethod + def _build_url_result(cls, embed_code): + return cls.url_result(cls._url_for_embed_code(embed_code), + ie=cls.ie_key()) + def _extract_result(self, info, more_info): return {'id': info['embedCode'], 'ext': 'mp4', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 4ea89bf85..beea58d63 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,5 +1,6 @@ # encoding: utf-8 +import os.path import re import json import hashlib @@ -10,6 +11,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, ExtractorError, + url_basename, ) @@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor): # We will extract some from the video web page instead video_page_url = 'http://' + mobj.group('url') video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') - + + # Warning if video is unavailable + warning = self._html_search_regex( + r'<div class="videoUnModer">(.*?)</div>', video_page, + u'warning messagef', default=None) + if warning is not None: + self._downloader.report_warning( + u'Video %s may not be available; smotri said: %s ' % + (video_id, warning)) + # Adult content if re.search(u'EroConfirmText">', video_page) is not None: self.report_age_confirmation() @@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor): # Extract the rest of meta data video_title = self._search_meta(u'name', video_page, u'title') if not video_title: - video_title = video_url.rsplit('/', 1)[-1] + video_title = os.path.splitext(url_basename(video_url))[0] video_description = self._search_meta(u'description', video_page) END_TEXT = u' на сайте Smotri.com' - if video_description.endswith(END_TEXT): + if video_description and video_description.endswith(END_TEXT): video_description = video_description[:-len(END_TEXT)] START_TEXT = u'Смотреть онлайн ролик ' - if video_description.startswith(START_TEXT): + if video_description and video_description.startswith(START_TEXT): video_description = video_description[len(START_TEXT):] video_thumbnail = self._search_meta(u'thumbnail', video_page) upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') - upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) - video_upload_date = ( - ( - upload_date_m.group('year') + - upload_date_m.group('month') + - upload_date_m.group('day') + if upload_date_str: + upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) + video_upload_date = ( + ( + upload_date_m.group('year') + + upload_date_m.group('month') + + upload_date_m.group('day') + ) + if upload_date_m else None ) - if upload_date_m else None - ) + else: + video_upload_date = None duration_str = self._search_meta(u'duration', video_page) - duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) - video_duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) + if duration_str: + duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) + video_duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m else None ) - if duration_m else None - ) + else: + video_duration = None video_uploader = self._html_search_regex( u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', @@ -202,7 +219,7 @@ class SmotriIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'video_duration': video_duration, + 'duration': video_duration, 'view_count': video_view_count, 'age_limit': 18 if adult_content else 0, 'video_page_url': video_page_url diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cbba4094b..e22ff9c38 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -24,7 +24,7 @@ class SoundcloudIE(InfoExtractor): """ _VALID_URL = r'''^(?:https?://)? - (?:(?:(?:www\.)?soundcloud\.com/ + (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ (?!sets/)(?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 4f803bcd3..5a136a952 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -15,7 +15,7 @@ class Vbox7IE(InfoExtractor): _TEST = { u'url': u'http://vbox7.com/play:249bb972c2', u'file': u'249bb972c2.flv', - u'md5': u'9c70d6d956f888bdc08c124acc120cfe', + u'md5': u'99f65c0c9ef9b682b97313e052734c3f', u'info_dict': { u"title": u"\u0421\u043c\u044f\u0445! \u0427\u0443\u0434\u043e - \u0447\u0438\u0441\u0442 \u0437\u0430 \u0441\u0435\u043a\u0443\u043d\u0434\u0438 - \u0421\u043a\u0440\u0438\u0442\u0430 \u043a\u0430\u043c\u0435\u0440\u0430" } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ea4409528..c3623fcbe 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,11 +16,20 @@ from ..utils import ( unsmuggle_url, ) + class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'''(?x) + (?P<proto>https?://)? + (?:(?:www|(?P<player>player))\.)? + vimeo(?P<pro>pro)?\.com/ + (?:.*?/)? + (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? + (?:videos?/)? + (?P<id>[0-9]+) + /?(?:[?&].*)?(?:[#].*)?$''' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 58d274970..9fb07b366 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1666,7 +1666,7 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. - video_ids = [] + url_results = [] for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 @@ -1684,10 +1684,17 @@ class YoutubeUserIE(InfoExtractor): break # Extract video identifiers - ids_in_page = [] - for entry in response['feed']['entry']: - ids_in_page.append(entry['id']['$t'].split('/')[-1]) - video_ids.extend(ids_in_page) + entries = response['feed']['entry'] + for entry in entries: + title = entry['title']['$t'] + video_id = entry['id']['$t'].split('/')[-1] + url_results.append({ + '_type': 'url', + 'url': video_id, + 'ie_key': 'Youtube', + 'id': 'video_id', + 'title': title, + }) # A little optimization - if current page is not # "full", ie. does not contain PAGE_SIZE video ids then @@ -1695,12 +1702,9 @@ class YoutubeUserIE(InfoExtractor): # are no more ids on further pages - no need to query # again. - if len(ids_in_page) < self._GDATA_PAGE_SIZE: + if len(entries) < self._GDATA_PAGE_SIZE: break - url_results = [ - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] return self.playlist_result(url_results, playlist_title=username) |