diff options
Diffstat (limited to 'youtube_dl/extractor')
35 files changed, 1367 insertions, 318 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1149dc1ec..a39a1e2f4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,8 @@ -from .appletrailers import AppleTrailersIE +from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .anitube import AnitubeIE +from .aparat import AparatIE +from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ( @@ -13,6 +15,7 @@ from .arte import ( from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE +from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .breakcom import BreakIE @@ -20,6 +23,8 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cbs import CBSIE +from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .clipsyndicate import ClipsyndicateIE @@ -28,6 +33,7 @@ from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE +from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE from .d8 import D8IE from .dailymotion import ( @@ -78,6 +84,10 @@ from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE from .internetvideoarchive import InternetVideoArchiveIE +from .ivi import ( + IviIE, + IviCompilationIE +) from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE @@ -87,6 +97,7 @@ from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE +from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE @@ -111,9 +122,11 @@ from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .podomatic import PodomaticIE +from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .pyvideo import PyvideoIE +from .radiofrance import RadioFranceIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py new file mode 100644 index 000000000..ac05f8246 --- /dev/null +++ b/youtube_dl/extractor/academicearth.py @@ -0,0 +1,31 @@ +import re + +from .common import InfoExtractor + + +class AcademicEarthCourseIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)' + IE_NAME = u'AcademicEarth:Course' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + playlist_id = m.group('id') + + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_regex( + r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title') + description = self._html_search_regex( + r'<p class="excerpt">(.*?)</p>', + webpage, u'description', fatal=False) + urls = re.findall( + r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">', + webpage) + entries = [self.url_result(u) for u in urls] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'description': description, + 'entries': entries, + } diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py new file mode 100644 index 000000000..7e93bc4df --- /dev/null +++ b/youtube_dl/extractor/aparat.py @@ -0,0 +1,56 @@ +#coding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + HEADRequest, +) + + +class AparatIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + + _TEST = { + u'url': u'http://www.aparat.com/v/wP8On', + u'file': u'wP8On.mp4', + u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', + u'info_dict': { + u"title": u"تیم گلکسی 11 - زومیت", + }, + #u'skip': u'Extremely unreliable', + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + + video_id + u'/vt/frame') + webpage = self._download_webpage(embed_url, video_id) + + video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) + for i, video_url in enumerate(video_urls): + req = HEADRequest(video_url) + res = self._request_webpage( + req, video_id, note=u'Testing video URL %d' % i, errnote=False) + if res: + break + else: + raise ExtractorError(u'No working video URLs found') + + title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title') + thumbnail = self._search_regex( + r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 4b7bef775..9254fbfe0 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -266,20 +266,6 @@ class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = u'arte.tv:ddc' _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' - _TEST = { - u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien', - u'file': u'049881-009_PLUS7-D.flv', - u'info_dict': { - u'title': u'Mit offenen Karten', - u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6', - u'upload_date': u'20131207', - }, - u'params': { - # rtmp download - u'skip_download': True, - }, - } - def _real_extract(self, url): video_id, lang = self._extract_url_info(url) if lang == 'folge': diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py new file mode 100644 index 000000000..144ce64cc --- /dev/null +++ b/youtube_dl/extractor/blinkx.py @@ -0,0 +1,90 @@ +import datetime +import json +import re + +from .common import InfoExtractor +from ..utils import ( + remove_start, +) + + +class BlinkxIE(InfoExtractor): + _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' + _IE_NAME = u'blinkx' + + _TEST = { + u'url': u'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB', + u'file': u'8aQUy7GV.mp4', + u'md5': u'2e9a07364af40163a908edbf10bb2492', + u'info_dict': { + u"title": u"Police Car Rolls Away", + u"uploader": u"stupidvideos.com", + u"upload_date": u"20131215", + u"description": u"A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!", + u"duration": 14.886, + u"thumbnails": [{ + "width": 100, + "height": 76, + "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", + }], + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + display_id = video_id[:8] + + api_url = (u'https://apib4.blinkx.com/api.php?action=play_video&' + + u'video=%s' % video_id) + data_json = self._download_webpage(api_url, display_id) + data = json.loads(data_json)['api']['results'][0] + dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) + upload_date = dt.strftime('%Y%m%d') + + duration = None + thumbnails = [] + formats = [] + for m in data['media']: + if m['type'] == 'jpg': + thumbnails.append({ + 'url': m['link'], + 'width': int(m['w']), + 'height': int(m['h']), + }) + elif m['type'] == 'original': + duration = m['d'] + elif m['type'] == 'youtube': + yt_id = m['link'] + self.to_screen(u'Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) + elif m['type'] in ('flv', 'mp4'): + vcodec = remove_start(m['vcodec'], 'ff') + acodec = remove_start(m['acodec'], 'ff') + format_id = (u'%s-%sk-%s' % + (vcodec, + (int(m['vbr']) + int(m['abr'])) // 1000, + m['w'])) + formats.append({ + 'format_id': format_id, + 'url': m['link'], + 'vcodec': vcodec, + 'acodec': acodec, + 'abr': int(m['abr']) // 1000, + 'vbr': int(m['vbr']) // 1000, + 'width': int(m['w']), + 'height': int(m['h']), + }) + formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr'])) + + return { + 'id': display_id, + 'fullid': video_id, + 'title': data['title'], + 'formats': formats, + 'uploader': data['channel_name'], + 'upload_date': upload_date, + 'description': data.get('description'), + 'thumbnails': thumbnails, + 'duration': duration, + } diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 5e33a69df..0e63208df 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -70,13 +70,14 @@ class BlipTVIE(InfoExtractor): info = None urlh = self._request_webpage(request, None, False, u'unable to download video info webpage') + if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download basename = url.split('/')[-1] title,ext = os.path.splitext(basename) title = title.decode('UTF-8') ext = ext.replace('.', '') self.report_direct_download(title) - info = { + return { 'id': title, 'url': url, 'uploader': None, @@ -85,49 +86,47 @@ class BlipTVIE(InfoExtractor): 'ext': ext, 'urlhandle': urlh } - if info is None: # Regular URL - try: - json_code_bytes = urlh.read() - json_code = json_code_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) - - try: - json_data = json.loads(json_code) - if 'Post' in json_data: - data = json_data['Post'] - else: - data = json_data - - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - if 'additionalMedia' in data: - formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) - best_format = formats[-1] - video_url = best_format['url'] - else: - video_url = data['media']['url'] - umobj = re.match(self._URL_EXT, video_url) - if umobj is None: - raise ValueError('Can not determine filename extension') - ext = umobj.group(1) - - info = { - 'id': compat_str(data['item_id']), - 'url': video_url, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'ext': ext, - 'format': data['media']['mimeType'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'player_url': data['embedUrl'], - 'user_agent': 'iTunes/10.6.1', - } - except (ValueError,KeyError) as err: - raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) - - return [info] + + try: + json_code_bytes = urlh.read() + json_code = json_code_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) + + try: + json_data = json.loads(json_code) + if 'Post' in json_data: + data = json_data['Post'] + else: + data = json_data + + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') + if 'additionalMedia' in data: + formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) + best_format = formats[-1] + video_url = best_format['url'] + else: + video_url = data['media']['url'] + umobj = re.match(self._URL_EXT, video_url) + if umobj is None: + raise ValueError('Can not determine filename extension') + ext = umobj.group(1) + + return { + 'id': compat_str(data['item_id']), + 'url': video_url, + 'uploader': data['display_name'], + 'upload_date': upload_date, + 'title': data['title'], + 'ext': ext, + 'format': data['media']['mimeType'], + 'thumbnail': data['thumbnailUrl'], + 'description': data['description'], + 'player_url': data['embedUrl'], + 'user_agent': 'iTunes/10.6.1', + } + except (ValueError, KeyError) as err: + raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) class BlipTVUserIE(InfoExtractor): diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b1b7526ca..f7f0041c0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,7 +26,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'8eccab865181d29ec2958f32a6a754f5', + u'md5': u'5423e113865d26e40624dce2e4b45d95', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py new file mode 100644 index 000000000..ac0315853 --- /dev/null +++ b/youtube_dl/extractor/cbs.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor + + +class CBSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*' + + _TEST = { + u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + u'file': u'4JUVEwq3wUT7.flv', + u'info_dict': { + u'title': u'Connect Chat feat. Garth Brooks', + u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + u'duration': 1495, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + real_id = self._search_regex( + r"video\.settings\.pid\s*=\s*'([^']+)';", + webpage, u'real video ID') + return self.url_result(u'theplatform:%s' % real_id) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py new file mode 100644 index 000000000..ae70ea229 --- /dev/null +++ b/youtube_dl/extractor/channel9.py @@ -0,0 +1,267 @@ +# encoding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + +class Channel9IE(InfoExtractor): + ''' + Common extractor for channel9.msdn.com. + + The type of provided URL (video or playlist) is determined according to + meta Search.PageType from web page HTML rather than URL itself, as it is + not always possible to do. + ''' + IE_DESC = u'Channel 9' + IE_NAME = u'channel9' + _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' + + _TESTS = [ + { + u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + u'file': u'Events_TechEd_Australia_2013_KOS002.mp4', + u'md5': u'bbd75296ba47916b754e73c3a4bbdf10', + u'info_dict': { + u'title': u'Developer Kick-Off Session: Stuff We Love', + u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f', + u'duration': 4576, + u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + u'session_code': u'KOS002', + u'session_day': u'Day 1', + u'session_room': u'Arena 1A', + u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ], + }, + }, + { + u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', + u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', + u'info_dict': { + u'title': u'Self-service BI with Power BI - nuclear testing', + u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', + u'duration': 1540, + u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + u'authors': [ u'Mike Wilmot' ], + }, + } + ] + + _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + + # Sorted by quality + _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] + + def _restore_bytes(self, formatted_size): + if not formatted_size: + return 0 + m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size) + if not m: + return 0 + units = m.group('units') + try: + exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper()) + except ValueError: + return 0 + size = float(m.group('size')) + return int(size * (1024 ** exponent)) + + def _formats_from_html(self, html): + FORMAT_REGEX = r''' + (?x) + <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* + <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* + (?:<div\s+class="popup\s+rounded">\s* + <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* + </div>)? # File size part may be missing + ''' + # Extract known formats + formats = [{'url': x.group('url'), + 'format_id': x.group('quality'), + 'format_note': x.group('note'), + 'format': '%s (%s)' % (x.group('quality'), x.group('note')), + 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate + } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + # Sort according to known formats list + formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + return formats + + def _extract_title(self, html): + title = self._html_search_meta(u'title', html, u'title') + if title is None: + title = self._og_search_title(html) + TITLE_SUFFIX = u' (Channel 9)' + if title is not None and title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + return title + + def _extract_description(self, html): + DESCRIPTION_REGEX = r'''(?sx) + <div\s+class="entry-content">\s* + <div\s+id="entry-body">\s* + (?P<description>.+?)\s* + </div>\s* + </div> + ''' + m = re.search(DESCRIPTION_REGEX, html) + if m is not None: + return m.group('description') + return self._html_search_meta(u'description', html, u'description') + + def _extract_duration(self, html): + m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) + return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None + + def _extract_slides(self, html): + m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html) + return m.group('slidesurl') if m is not None else None + + def _extract_zip(self, html): + m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html) + return m.group('zipurl') if m is not None else None + + def _extract_avg_rating(self, html): + m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html) + return float(m.group('avgrating')) if m is not None else 0 + + def _extract_rating_count(self, html): + m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html) + return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 + + def _extract_view_count(self, html): + m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html) + return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 + + def _extract_comment_count(self, html): + m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html) + return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 + + def _fix_count(self, count): + return int(str(count).replace(',', '')) if count is not None else None + + def _extract_authors(self, html): + m = re.search(r'(?s)<li class="author">(.*?)</li>', html) + if m is None: + return None + return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1)) + + def _extract_session_code(self, html): + m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html) + return m.group('code') if m is not None else None + + def _extract_session_day(self, html): + m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html) + return m.group('day') if m is not None else None + + def _extract_session_room(self, html): + m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html) + return m.group('room') if m is not None else None + + def _extract_session_speakers(self, html): + return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) + + def _extract_content(self, html, content_path): + # Look for downloadable content + formats = self._formats_from_html(html) + slides = self._extract_slides(html) + zip_ = self._extract_zip(html) + + # Nothing to download + if len(formats) == 0 and slides is None and zip_ is None: + self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path) + return + + # Extract meta + title = self._extract_title(html) + description = self._extract_description(html) + thumbnail = self._og_search_thumbnail(html) + duration = self._extract_duration(html) + avg_rating = self._extract_avg_rating(html) + rating_count = self._extract_rating_count(html) + view_count = self._extract_view_count(html) + comment_count = self._extract_comment_count(html) + + common = {'_type': 'video', + 'id': content_path, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'avg_rating': avg_rating, + 'rating_count': rating_count, + 'view_count': view_count, + 'comment_count': comment_count, + } + + result = [] + + if slides is not None: + d = common.copy() + d.update({ 'title': title + '-Slides', 'url': slides }) + result.append(d) + + if zip_ is not None: + d = common.copy() + d.update({ 'title': title + '-Zip', 'url': zip_ }) + result.append(d) + + if len(formats) > 0: + d = common.copy() + d.update({ 'title': title, 'formats': formats }) + result.append(d) + + return result + + def _extract_entry_item(self, html, content_path): + contents = self._extract_content(html, content_path) + if contents is None: + return contents + + authors = self._extract_authors(html) + + for content in contents: + content['authors'] = authors + + return contents + + def _extract_session(self, html, content_path): + contents = self._extract_content(html, content_path) + if contents is None: + return contents + + session_meta = {'session_code': self._extract_session_code(html), + 'session_day': self._extract_session_day(html), + 'session_room': self._extract_session_room(html), + 'session_speakers': self._extract_session_speakers(html), + } + + for content in contents: + content.update(session_meta) + + return contents + + def _extract_list(self, content_path): + rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, content_path, title_text) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + content_path = mobj.group('contentpath') + + webpage = self._download_webpage(url, content_path, u'Downloading web page') + + page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage) + if page_type_m is None: + raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True) + + page_type = page_type_m.group('pagetype') + if page_type == 'List': # List page, may contain list of 'item'-like objects + return self._extract_list(content_path) + elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content + return self._extract_entry_item(webpage, content_path) + elif page_type == 'Session': # Event session page, may contain downloadable content + return self._extract_session(webpage, content_path) + else: + raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 69a083b68..ba46a7bc7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..utils import ( sanitize_filename, unescapeHTML, ) +_NO_DEFAULT = object() class InfoExtractor(object): @@ -34,15 +35,39 @@ class InfoExtractor(object): The dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. - ext: Video filename extension. - Instead of url and ext, formats can also specified. + Additionally, it must contain either a formats entry or url and ext: + + formats: A list of dictionaries for each format available, it must + be ordered from worst to best quality. Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19") + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * vbr Average video bitrate in KBit/s + * vcodec Name of the video codec in use + * filesize The number of bytes, if known in advance + * player_url SWF Player URL (used for rtmpdump). + url: Final video URL. + ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). + urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib.request.urlopen The following fields are optional: - format: The video format, defaults to ext (used for --get-format) thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. @@ -51,35 +76,14 @@ class InfoExtractor(object): upload_date: Video upload date (YYYYMMDD). uploader_id: Nickname or id of the video uploader. location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen age_limit: Age restriction for the video, as an integer (years) - formats: A list of dictionaries for each format available, it must - be ordered from worst to best quality. Potential fields: - * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing - * format A human-readable description of the format - ("mp4 container with h264/opus"). - Calculated from the format_id, width, height. - and format_note fields if missing. - * format_id A short description of the format - ("mp4_h264_opus" or "19") - * format_note Additional info about the format - ("3D" or "DASH video") - * width Width of the video, if known - * height Height of the video, if known - * abr Average audio bitrate in KBit/s - * acodec Name of the audio codec in use - * vbr Average video bitrate in KBit/s - * vcodec Name of the video codec in use - * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) @@ -166,6 +170,8 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is False: + return False if errnote is None: errnote = u'Unable to download webpage' errmsg = u'%s: %s' % (errnote, compat_str(err)) @@ -259,7 +265,8 @@ class InfoExtractor(object): self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None, video_id=None): + @staticmethod + def url_result(url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', @@ -268,7 +275,8 @@ class InfoExtractor(object): if video_id is not None: video_info['id'] = video_id return video_info - def playlist_result(self, entries, playlist_id=None, playlist_title=None): + @staticmethod + def playlist_result(entries, playlist_id=None, playlist_title=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} @@ -278,7 +286,7 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -292,7 +300,7 @@ class InfoExtractor(object): mobj = re.search(p, string, flags) if mobj: break - if sys.stderr.isatty() and os.name != 'nt': + if os.name != 'nt' and sys.stderr.isatty(): _name = u'\033[0;34m%s\033[0m' % name else: _name = name @@ -300,7 +308,7 @@ class InfoExtractor(object): if mobj: # return the first matching group return next(g for g in mobj.groups() if g is not None) - elif default is not None: + elif default is not _NO_DEFAULT: return default elif fatal: raise RegexNotFoundError(u'Unable to extract %s' % _name) @@ -309,7 +317,7 @@ class InfoExtractor(object): u'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): """ Like _search_regex, but strips HTML tags and unescapes entities. """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py new file mode 100644 index 000000000..2b66bddbb --- /dev/null +++ b/youtube_dl/extractor/crunchyroll.py @@ -0,0 +1,171 @@ +# encoding: utf-8 +import re, base64, zlib +from hashlib import sha1 +from math import pow, sqrt, floor +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + bytes_to_intlist, + intlist_to_bytes, + unified_strdate, + clean_html, +) +from ..aes import ( + aes_cbc_decrypt, + inc, +) + +class CrunchyrollIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?(?P<url>crunchyroll\.com/[^/]*/[^/?&]*?(?P<video_id>[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + u'file': u'645513.flv', + #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', + u'info_dict': { + u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', + u'description': u'md5:2d17137920c64f2f49981a7797d275ef', + u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + u'uploader': u'Yomiuri Telecasting Corporation (YTV)', + u'upload_date': u'20131013', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + }] + + _FORMAT_IDS = { + u'360': (u'60', u'106'), + u'480': (u'61', u'106'), + u'720': (u'62', u'106'), + u'1080': (u'80', u'108'), + } + + def _decrypt_subtitles(self, data, iv, id): + data = bytes_to_intlist(data) + iv = bytes_to_intlist(iv) + id = int(id) + + def obfuscate_key_aux(count, modulo, start): + output = list(start) + for _ in range(count): + output.append(output[-1] + output[-2]) + # cut off start values + output = output[2:] + output = list(map(lambda x: x % modulo + 33, output)) + return output + + def obfuscate_key(key): + num1 = int(floor(pow(2, 25) * sqrt(6.9))) + num2 = (num1 ^ key) << 5 + num3 = key ^ num1 + num4 = num3 ^ (num3 >> 3) ^ num2 + prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) + # Extend 160 Bit hash to 256 Bit + return shaHash + [0] * 12 + + key = obfuscate_key(id) + class Counter: + __value = iv + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) + return zlib.decompress(decrypted_data) + + def _convert_subtitles_to_srt(self, subtitles): + i=1 + output = u'' + for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + start = start.replace(u'.', u',') + end = end.replace(u'.', u',') + text = clean_html(text) + text = text.replace(u'\\N', u'\n') + if not text: + continue + output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + i+=1 + return output + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://www.' + mobj.group('url') + video_id = mobj.group(u'video_id') + webpage = self._download_webpage(webpage_url, video_id) + note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, u'trailer-notice', default=u'') + if note_m: + raise ExtractorError(note_m) + + video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL) + video_title = re.sub(r' {2,}', u' ', video_title) + video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') + if not video_description: + video_description = None + video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) + if video_upload_date: + video_upload_date = unified_strdate(video_upload_date) + video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) + + playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) + playerdata_req = compat_urllib_request.Request(playerdata_url) + playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) + playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') + + stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, u'stream_id') + video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, u'thumbnail', fatal=False) + + formats = [] + for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt+u'p' + streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') + # urlencode doesn't work! + streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format + streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) + streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) + video_url = self._search_regex(r'<host>([^<]+)', streamdata, u'video_url') + video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, u'video_play_path') + formats.append({ + u'url': video_url, + u'play_path': video_play_path, + u'ext': 'flv', + u'format': video_format, + u'format_id': video_format, + }) + + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ + video_id, note=u'Downloading subtitles for '+sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) + iv = self._search_regex(r'<iv>([^<]+)', sub_page, u'subtitle_iv', fatal=False) + data = self._search_regex(r'<data>([^<]+)', sub_page, u'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') + lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) + if not lang_code: + continue + subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + + return { + u'id': video_id, + u'title': video_title, + u'description': video_description, + u'thumbnail': video_thumbnail, + u'uploader': video_uploader, + u'upload_date': video_upload_date, + u'subtitles': subtitles, + u'formats': formats, + } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index aea7e557e..6685c94a3 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = u'dailymotion' _FORMATS = [ @@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('_')[0].split('?')[0] + video_id = mobj.group('id') url = 'http://www.dailymotion.com/video/%s' % video_id diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = u'daum.net' _TEST = { diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3b210710e..4556079c8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,7 +17,7 @@ from ..utils import ( class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): u'file': u'120708114770723.mp4', u'md5': u'48975a41ccc4b7a581abd68651c1a5a8', u'info_dict': { - u"duration": 279, + u"duration": 279, u"title": u"PEOPLE ARE AWESOME 2013" } } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 216e03218..7a14c98f9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,10 +11,14 @@ from ..utils import ( compat_urlparse, ExtractorError, + HEADRequest, smuggle_url, unescapeHTML, + unified_strdate, + url_basename, ) from .brightcove import BrightcoveIE +from .ooyala import OoyalaIE class GenericIE(InfoExtractor): @@ -71,6 +75,27 @@ class GenericIE(InfoExtractor): u'skip_download': True, }, }, + # Direct link to a video + { + u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4', + u'file': u'trailer.mp4', + u'md5': u'67d406c2bcb6af27fa886f31aa934bbe', + u'info_dict': { + u'id': u'trailer', + u'title': u'trailer', + u'upload_date': u'20100513', + } + }, + # ooyala video + { + u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c', + u'info_dict': { + u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', + u'ext': u'mp4', + u'title': u'2cc213299525360.mov', #that's what we get + }, + }, ] def report_download_webpage(self, video_id): @@ -83,23 +108,20 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - def _test_redirect(self, url): + def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" - class HeadRequest(compat_urllib_request.Request): - def get_method(self): - return "HEAD" class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ Subclass the HTTPRedirectHandler to make it use our - HeadRequest also on the redirected URL + HEADRequest also on the redirected URL """ def redirect_request(self, req, fp, code, msg, headers, newurl): if code in (301, 302, 303, 307): newurl = newurl.replace(' ', '%20') newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, + return HEADRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(), unverifiable=True) @@ -128,32 +150,49 @@ class GenericIE(InfoExtractor): compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) - response = opener.open(HeadRequest(url)) + response = opener.open(HEADRequest(url)) if response is None: raise ExtractorError(u'Invalid URL protocol') - new_url = response.geturl() - - if url == new_url: - return False - - self.report_following_redirect(new_url) - return new_url + return response def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) + video_id = os.path.splitext(url.split('/')[-1])[0] try: - new_url = self._test_redirect(url) - if new_url: - return [self.url_result(new_url)] + response = self._send_head(url) + + # Check for redirect + new_url = response.geturl() + if url != new_url: + self.report_following_redirect(new_url) + return self.url_result(new_url) + + # Check for direct link to a video + content_type = response.headers.get('Content-Type', '') + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) + if m: + upload_date = response.headers.get('Last-Modified') + if upload_date: + upload_date = unified_strdate(upload_date) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': u'none' if m.group('type') == 'audio' else None + }], + 'upload_date': upload_date, + } + except compat_urllib_error.HTTPError: # This may be a stupid server that doesn't like HEAD, our UA, or so pass - video_id = url.split('/')[-1] try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -183,7 +222,7 @@ class GenericIE(InfoExtractor): self.to_screen(u'Brightcove video detected.') return self.url_result(bc_url, 'Brightcove') - # Look for embedded Vimeo player + # Look for embedded (iframe) Vimeo player mobj = re.search( r'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage) if mobj: @@ -191,9 +230,18 @@ class GenericIE(InfoExtractor): surl = smuggle_url(player_url, {'Referer': url}) return self.url_result(surl, 'Vimeo') + # Look for embedded (swf embed) Vimeo player + mobj = re.search( + r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) + if mobj: + return self.url_result(mobj.group(1), 'Vimeo') + # Look for embedded YouTube player - matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) + matches = re.findall(r'''(?x) + (?:<iframe[^>]+?src=|embedSWF\(\s*) + (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/ + (?:embed|v)/.+?) + \1''', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] @@ -222,6 +270,18 @@ class GenericIE(InfoExtractor): 'id': video_id, } + # Look for embedded blip.tv player + mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage) + if mobj: + return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV') + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage) + if mobj: + player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1) + player_page = self._download_webpage(player_url, mobj.group(1)) + blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False) + if blip_video_id: + return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV') + # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -229,6 +289,22 @@ class GenericIE(InfoExtractor): # Don't set the extractor because it can be a track url or an album return self.url_result(burl) + # Look for embedded Vevo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for Ooyala videos + mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage) + if mobj is not None: + return OoyalaIE._build_url_result(mobj.group(1)) + + # Look for Aparat videos + mobj = re.search(r'<iframe src="(http://www.aparat.com/video/[^"]+)"', webpage) + if mobj is not None: + return self.url_result(mobj.group(1), 'Aparat') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 57b79a336..381af91e4 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -44,7 +44,7 @@ class IGNIE(InfoExtractor): { u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', u'info_dict': { - u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', }, }, diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 6fb373db2..e5332cce8 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -11,7 +11,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = u'imdb' IE_DESC = u'Internet Movie Database trailers' - _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { u'url': u'http://www.imdb.com/video/imdb/vi2524815897', @@ -27,7 +27,7 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url,video_id) + webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) descr = get_element_by_attribute('itemprop', 'description', webpage) available_formats = re.findall( r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py new file mode 100644 index 000000000..4bdf55f93 --- /dev/null +++ b/youtube_dl/extractor/ivi.py @@ -0,0 +1,154 @@ +# encoding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + ExtractorError, +) + + +class IviIE(InfoExtractor): + IE_DESC = u'ivi.ru' + IE_NAME = u'ivi' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' + + _TESTS = [ + # Single movie + { + u'url': u'http://www.ivi.ru/watch/53141', + u'file': u'53141.mp4', + u'md5': u'6ff5be2254e796ed346251d117196cf4', + u'info_dict': { + u'title': u'Иван Васильевич меняет профессию', + u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346', + u'duration': 5498, + u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', + }, + u'skip': u'Only works from Russia', + }, + # Serial's serie + { + u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791', + u'file': u'74791.mp4', + u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9', + u'info_dict': { + u'title': u'Дежурный ангел - 1 серия', + u'duration': 2490, + u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', + }, + u'skip': u'Only works from Russia', + } + ] + + # Sorted by quality + _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + + # Sorted by size + _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480'] + + def _extract_description(self, html): + m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html) + return m.group('description') if m is not None else None + + def _extract_comment_count(self, html): + m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) + return int(m.group('commentcount')) if m is not None else 0 + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + + api_url = 'http://api.digitalaccess.ru/api/json/' + + data = {u'method': u'da.content.get', + u'params': [video_id, {u'site': u's183', + u'referrer': u'http://www.ivi.ru/watch/%s' % video_id, + u'contentid': video_id + } + ] + } + + request = compat_urllib_request.Request(api_url, json.dumps(data)) + + video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON') + video_json = json.loads(video_json_page) + + if u'error' in video_json: + error = video_json[u'error'] + if error[u'origin'] == u'NoRedisValidData': + raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True) + + result = video_json[u'result'] + + formats = [{'url': x[u'url'], + 'format_id': x[u'content_format'] + } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + + if len(formats) == 0: + self._downloader.report_warning(u'No media links available for %s' % video_id) + return + + duration = result[u'duration'] + compilation = result[u'compilation'] + title = result[u'title'] + + title = '%s - %s' % (compilation, title) if compilation is not None else title + + previews = result[u'preview'] + previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) + thumbnail = previews[-1][u'url'] if len(previews) > 0 else None + + video_page = self._download_webpage(url, video_id, u'Downloading video page') + description = self._extract_description(video_page) + comment_count = self._extract_comment_count(video_page) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + 'comment_count': comment_count, + 'formats': formats, + } + + +class IviCompilationIE(InfoExtractor): + IE_DESC = u'ivi.ru compilations' + IE_NAME = u'ivi:compilation' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' + + def _extract_entries(self, html, compilation_id): + return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi') + for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + compilation_id = mobj.group('compilationid') + season_id = mobj.group('seasonid') + + if season_id is not None: # Season link + season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id) + playlist_id = '%s/season%s' % (compilation_id, season_id) + playlist_title = self._html_search_meta(u'title', season_page, u'title') + entries = self._extract_entries(season_page, compilation_id) + else: # Compilation link + compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page') + playlist_id = compilation_id + playlist_title = self._html_search_meta(u'title', compilation_page, u'title') + seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page) + if len(seasons) == 0: # No seasons in this compilation + entries = self._extract_entries(compilation_page, compilation_id) + else: + entries = [] + for season_id in seasons: + season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), + compilation_id, u'Downloading season %s web page' % season_id) + entries.extend(self._extract_entries(season_page, compilation_id)) + + return self.playlist_result(entries, playlist_id, playlist_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py new file mode 100644 index 000000000..08ce0647f --- /dev/null +++ b/youtube_dl/extractor/mdr.py @@ -0,0 +1,63 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class MDRIE(InfoExtractor): + _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*' + + # No tests, MDR regularily deletes its videos + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('video_id') + domain = m.group('domain') + + # determine title and media streams from webpage + html = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title') + xmlurl = self._search_regex( + r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL') + + doc = self._download_xml(domain + xmlurl, video_id) + formats = [] + for a in doc.findall('./assets/asset'): + url_el = a.find('.//progressiveDownloadUrl') + if url_el is None: + continue + abr = int(a.find('bitrateAudio').text) // 1000 + media_type = a.find('mediaType').text + format = { + 'abr': abr, + 'filesize': int(a.find('fileSize').text), + 'url': url_el.text, + } + + vbr_el = a.find('bitrateVideo') + if vbr_el is None: + format.update({ + 'vcodec': 'none', + 'format_id': u'%s-%d' % (media_type, abr), + }) + else: + vbr = int(vbr_el.text) // 1000 + format.update({ + 'vbr': vbr, + 'width': int(a.find('frameWidth').text), + 'height': int(a.find('frameHeight').text), + 'format_id': u'%s-%d' % (media_type, vbr), + }) + formats.append(format) + formats.sort(key=lambda f: (f.get('vbr'), f['abr'])) + if not formats: + raise ExtractorError(u'Could not find any valid formats') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 5b2bd9633..ed11f521a 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -93,7 +93,9 @@ class MTVServicesInfoExtractor(InfoExtractor): class MTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + _VALID_URL = r'''(?x)^https?:// + (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| + m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))''' _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' @@ -127,16 +129,17 @@ class MTVIE(MTVServicesInfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - - webpage = self._download_webpage(url, video_id) - - # Some videos come from Vevo.com - m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";', - webpage, re.DOTALL) - if m_vevo: - vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - - uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = mobj.group('mgid') + if uri is None: + webpage = self._download_webpage(url, video_id) + + # Some videos come from Vevo.com + m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";', + webpage, re.DOTALL) + if m_vevo: + vevo_id = m_vevo.group(1); + self.to_screen(u'Vevo video detected: %s' % vevo_id) + return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') return self._get_videos_info(uri) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { u'url': u'http://tvcast.naver.com/v/81652', diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 2e8501f99..d81df3c10 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,6 +1,4 @@ -import json import re -import time from .common import InfoExtractor from ..utils import month_by_name diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 1f7b4d2e7..d08e47734 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -22,6 +22,11 @@ class OoyalaIE(InfoExtractor): def _url_for_embed_code(embed_code): return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + @classmethod + def _build_url_result(cls, embed_code): + return cls.url_result(cls._url_for_embed_code(embed_code), + ie=cls.ie_key()) + def _extract_result(self, info, more_info): return {'id': info['embedCode'], 'ext': 'mp4', diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py new file mode 100644 index 000000000..71abd5013 --- /dev/null +++ b/youtube_dl/extractor/pornhd.py @@ -0,0 +1,38 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urllib_parse + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _TEST = { + u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + u'file': u'1962.flv', + u'md5': u'35272469887dca97abd30abecc6cdf75', + u'info_dict': { + u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + video_title = mobj.group('video_title') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'&hd=(http.+?)&', webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) + age_limit = 18 + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py new file mode 100644 index 000000000..34652f6c1 --- /dev/null +++ b/youtube_dl/extractor/radiofrance.py @@ -0,0 +1,55 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + + +class RadioFranceIE(InfoExtractor): + _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + IE_NAME = u'radiofrance' + + _TEST = { + u'url': u'http://maison.radiofrance.fr/radiovisions/one-one', + u'file': u'one-one.ogg', + u'md5': u'bdbb28ace95ed0e04faab32ba3160daf', + u'info_dict': { + u"title": u"One to one", + u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + u"uploader": u"Thomas Hercouët", + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, u'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit"> © (.*?)</div>', + webpage, u'uploader', fatal=False) + + formats_str = self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, u'audio URLs') + formats = [ + { + 'format_id': fm[0], + 'url': fm[1], + 'vcodec': 'none', + } + for fm in + re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str) + ] + # No sorting, we don't know any more about these formats + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 511674d8d..ccf0b1546 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -10,7 +10,7 @@ from ..utils import ( class RTLnowIE(InfoExtractor): """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -82,7 +82,7 @@ class RTLnowIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) webpage_url = u'http://' + mobj.group('url') - video_page_url = u'http://' + mobj.group('base_url') + video_page_url = u'http://' + mobj.group('domain') + u'/' video_id = mobj.group(u'video_id') webpage = self._download_webpage(webpage_url, video_id) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 4ea89bf85..beea58d63 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,5 +1,6 @@ # encoding: utf-8 +import os.path import re import json import hashlib @@ -10,6 +11,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, ExtractorError, + url_basename, ) @@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor): # We will extract some from the video web page instead video_page_url = 'http://' + mobj.group('url') video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') - + + # Warning if video is unavailable + warning = self._html_search_regex( + r'<div class="videoUnModer">(.*?)</div>', video_page, + u'warning messagef', default=None) + if warning is not None: + self._downloader.report_warning( + u'Video %s may not be available; smotri said: %s ' % + (video_id, warning)) + # Adult content if re.search(u'EroConfirmText">', video_page) is not None: self.report_age_confirmation() @@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor): # Extract the rest of meta data video_title = self._search_meta(u'name', video_page, u'title') if not video_title: - video_title = video_url.rsplit('/', 1)[-1] + video_title = os.path.splitext(url_basename(video_url))[0] video_description = self._search_meta(u'description', video_page) END_TEXT = u' на сайте Smotri.com' - if video_description.endswith(END_TEXT): + if video_description and video_description.endswith(END_TEXT): video_description = video_description[:-len(END_TEXT)] START_TEXT = u'Смотреть онлайн ролик ' - if video_description.startswith(START_TEXT): + if video_description and video_description.startswith(START_TEXT): video_description = video_description[len(START_TEXT):] video_thumbnail = self._search_meta(u'thumbnail', video_page) upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') - upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) - video_upload_date = ( - ( - upload_date_m.group('year') + - upload_date_m.group('month') + - upload_date_m.group('day') + if upload_date_str: + upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) + video_upload_date = ( + ( + upload_date_m.group('year') + + upload_date_m.group('month') + + upload_date_m.group('day') + ) + if upload_date_m else None ) - if upload_date_m else None - ) + else: + video_upload_date = None duration_str = self._search_meta(u'duration', video_page) - duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) - video_duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) + if duration_str: + duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) + video_duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m else None ) - if duration_m else None - ) + else: + video_duration = None video_uploader = self._html_search_regex( u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', @@ -202,7 +219,7 @@ class SmotriIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'video_duration': video_duration, + 'duration': video_duration, 'view_count': video_view_count, 'age_limit': 18 if adult_content else 0, 'video_page_url': video_page_url diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cbba4094b..e22ff9c38 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -24,7 +24,7 @@ class SoundcloudIE(InfoExtractor): """ _VALID_URL = r'''^(?:https?://)? - (?:(?:(?:www\.)?soundcloud\.com/ + (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ (?!sets/)(?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 61452e47d..cec65261b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -3,6 +3,7 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, xpath_with_ns, ) @@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor): smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) meta = self._download_xml(smil_url, video_id) + + try: + error_msg = next( + n.attrib['abstract'] + for n in meta.findall(_x('.//smil:ref')) + if n.attrib.get('title') == u'Geographic Restriction') + except StopIteration: + pass + else: + raise ExtractorError(error_msg, expected=True) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 4f803bcd3..5a136a952 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -15,7 +15,7 @@ class Vbox7IE(InfoExtractor): _TEST = { u'url': u'http://vbox7.com/play:249bb972c2', u'file': u'249bb972c2.flv', - u'md5': u'9c70d6d956f888bdc08c124acc120cfe', + u'md5': u'99f65c0c9ef9b682b97313e052734c3f', u'info_dict': { u"title": u"\u0421\u043c\u044f\u0445! \u0427\u0443\u0434\u043e - \u0447\u0438\u0441\u0442 \u0437\u0430 \u0441\u0435\u043a\u0443\u043d\u0434\u0438 - \u0421\u043a\u0440\u0438\u0442\u0430 \u043a\u0430\u043c\u0435\u0440\u0430" } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4823992ef..a4b26a26f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,12 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'''(?x) + (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| + https?://cache\.vevo\.com/m/html/embed\.html\?video=| + https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| + vevo:) + (?P<id>[^&?#]+)''' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py index acae81448..65463c733 100644 --- a/youtube_dl/extractor/videopremium.py +++ b/youtube_dl/extractor/videopremium.py @@ -15,6 +15,7 @@ class VideoPremiumIE(InfoExtractor): u'params': { u'skip_download': True, }, + u'skip': u'Test file has been deleted.', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ea4409528..c3623fcbe 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,11 +16,20 @@ from ..utils import ( unsmuggle_url, ) + class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'''(?x) + (?P<proto>https?://)? + (?:(?:www|(?P<player>player))\.)? + vimeo(?P<pro>pro)?\.com/ + (?:.*?/)? + (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? + (?:videos?/)? + (?P<id>[0-9]+) + /?(?:[?&].*)?(?:[#].*)?$''' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index e3458d2bd..1a6a7688d 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -32,7 +32,7 @@ class XTubeIE(InfoExtractor): video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title') video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False) - video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None) + video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False) video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 874429b78..55c345e8a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -162,23 +162,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Dash audio '141', '172', '140', '171', '139', ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', - # Apple HTTP Live Streaming - '96', '95', '94', '93', '92', '132', '151', - # 3D - '85', '102', '84', '101', '83', '100', '82', - # Dash video - '138', '248', '137', '247', '136', '246', '245', - '244', '135', '243', '134', '242', '133', '160', - # Dash audio - '172', '141', '171', '140', '139', - ] - _video_formats_map = { - 'flv': ['35', '34', '6', '5'], - '3gp': ['36', '17', '13'], - 'mp4': ['38', '37', '22', '18'], - 'webm': ['46', '45', '44', '43'], - } _video_extensions = { '13': '3gp', '17': '3gp', @@ -236,54 +219,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '248': 'webm', } _video_dimensions = { - '5': '400x240', - '6': '???', - '13': '???', - '17': '176x144', - '18': '640x360', - '22': '1280x720', - '34': '640x360', - '35': '854x480', - '36': '320x240', - '37': '1920x1080', - '38': '4096x3072', - '43': '640x360', - '44': '854x480', - '45': '1280x720', - '46': '1920x1080', - '82': '360p', - '83': '480p', - '84': '720p', - '85': '1080p', - '92': '240p', - '93': '360p', - '94': '480p', - '95': '720p', - '96': '1080p', - '100': '360p', - '101': '480p', - '102': '720p', - '132': '240p', - '151': '72p', - '133': '240p', - '134': '360p', - '135': '480p', - '136': '720p', - '137': '1080p', - '138': '>1080p', - '139': '48k', - '140': '128k', - '141': '256k', - '160': '192p', - '171': '128k', - '172': '256k', - '242': '240p', - '243': '360p', - '244': '480p', - '245': '480p', - '246': '480p', - '247': '720p', - '248': '1080p', + '5': {'width': 400, 'height': 240}, + '6': {}, + '13': {}, + '17': {'width': 176, 'height': 144}, + '18': {'width': 640, 'height': 360}, + '22': {'width': 1280, 'height': 720}, + '34': {'width': 640, 'height': 360}, + '35': {'width': 854, 'height': 480}, + '36': {'width': 320, 'height': 240}, + '37': {'width': 1920, 'height': 1080}, + '38': {'width': 4096, 'height': 3072}, + '43': {'width': 640, 'height': 360}, + '44': {'width': 854, 'height': 480}, + '45': {'width': 1280, 'height': 720}, + '46': {'width': 1920, 'height': 1080}, + '82': {'height': 360, 'display': '360p'}, + '83': {'height': 480, 'display': '480p'}, + '84': {'height': 720, 'display': '720p'}, + '85': {'height': 1080, 'display': '1080p'}, + '92': {'height': 240, 'display': '240p'}, + '93': {'height': 360, 'display': '360p'}, + '94': {'height': 480, 'display': '480p'}, + '95': {'height': 720, 'display': '720p'}, + '96': {'height': 1080, 'display': '1080p'}, + '100': {'height': 360, 'display': '360p'}, + '101': {'height': 480, 'display': '480p'}, + '102': {'height': 720, 'display': '720p'}, + '132': {'height': 240, 'display': '240p'}, + '151': {'height': 72, 'display': '72p'}, + '133': {'height': 240, 'display': '240p'}, + '134': {'height': 360, 'display': '360p'}, + '135': {'height': 480, 'display': '480p'}, + '136': {'height': 720, 'display': '720p'}, + '137': {'height': 1080, 'display': '1080p'}, + '138': {'height': 1081, 'display': '>1080p'}, + '139': {'display': '48k'}, + '140': {'display': '128k'}, + '141': {'display': '256k'}, + '160': {'height': 192, 'display': '192p'}, + '171': {'display': '128k'}, + '172': {'display': '256k'}, + '242': {'height': 240, 'display': '240p'}, + '243': {'height': 360, 'display': '360p'}, + '244': {'height': 480, 'display': '480p'}, + '245': {'height': 480, 'display': '480p'}, + '246': {'height': 480, 'display': '480p'}, + '247': {'height': 720, 'display': '720p'}, + '248': {'height': 1080, 'display': '1080p'}, } _special_itags = { '82': '3D', @@ -1153,13 +1136,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), - self._video_dimensions.get(x, '???'), - ' ('+self._special_itags[x]+')' if x in self._special_itags else '')) - def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: @@ -1172,48 +1148,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): Transform a dictionary in the format {itag:url} to a list of (itag, url) with the requested formats. """ - req_format = self._downloader.params.get('format', None) - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] + existing_formats = [x for x in self._available_formats if x in url_map] if len(existing_formats) == 0: raise ExtractorError(u'no known formats available for video') - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality - # available in the specified format. For example, - # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. - # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if rf in self._video_formats_map: - for srf in self._video_formats_map[rf]: - if srf in url_map: - video_url_list = [(srf, url_map[srf])] - break - else: - continue - break - if video_url_list is None: - raise ExtractorError(u'requested format not available') + video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats + video_url_list.reverse() # order worst to best return video_url_list def _extract_from_m3u8(self, manifest_url, video_id): @@ -1361,7 +1300,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_description = u'' def _extract_count(klass): - count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False) + count = self._search_regex( + r'class="%s">([\d,]+)</span>' % re.escape(klass), + video_webpage, klass, default=None) if count is not None: return int(count.replace(',', '')) return None @@ -1377,9 +1318,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'length_seconds' not in video_info: self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' + video_duration = None else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None @@ -1460,50 +1401,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url video_url_list = self._get_video_url_list(url_map) - if not video_url_list: - return elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) video_url_list = self._get_video_url_list(url_map) - if not video_url_list: - return - else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - results = [] + formats = [] for itag, video_real_url in video_url_list: # Extension video_extension = self._video_extensions.get(itag, 'flv') + resolution = self._video_dimensions.get(itag, {}).get('display') + width = self._video_dimensions.get(itag, {}).get('width') + height = self._video_dimensions.get(itag, {}).get('height') + note = self._special_itags.get(itag) video_format = '{0} - {1}{2}'.format(itag if itag else video_extension, - self._video_dimensions.get(itag, '???'), + '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'), ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '') - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'format_id': itag, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, + formats.append({ + 'url': video_real_url, + 'ext': video_extension, + 'format': video_format, + 'format_id': itag, + 'player_url': player_url, + '_resolution': resolution, + 'width': width, + 'height': height, + 'format_note': note, }) - return results + + def _formats_key(f): + note = f.get('format_note') + if note is None: + note = u'' + is_dash = u'DASH' in note + return ( + 0 if is_dash else 1, + f.get('height') if f.get('height') is not None else -1, + f.get('width') if f.get('width') is not None else -1) + formats.sort(key=_formats_key) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'upload_date': upload_date, + 'title': video_title, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'subtitles': video_subtitles, + 'duration': video_duration, + 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations, + 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' @@ -1715,7 +1673,7 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. - video_ids = [] + url_results = [] for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 @@ -1733,10 +1691,17 @@ class YoutubeUserIE(InfoExtractor): break # Extract video identifiers - ids_in_page = [] - for entry in response['feed']['entry']: - ids_in_page.append(entry['id']['$t'].split('/')[-1]) - video_ids.extend(ids_in_page) + entries = response['feed']['entry'] + for entry in entries: + title = entry['title']['$t'] + video_id = entry['id']['$t'].split('/')[-1] + url_results.append({ + '_type': 'url', + 'url': video_id, + 'ie_key': 'Youtube', + 'id': 'video_id', + 'title': title, + }) # A little optimization - if current page is not # "full", ie. does not contain PAGE_SIZE video ids then @@ -1744,12 +1709,9 @@ class YoutubeUserIE(InfoExtractor): # are no more ids on further pages - no need to query # again. - if len(ids_in_page) < self._GDATA_PAGE_SIZE: + if len(entries) < self._GDATA_PAGE_SIZE: break - url_results = [ - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] return self.playlist_result(url_results, playlist_title=username) |