diff options
Diffstat (limited to 'youtube_dl/extractor')
40 files changed, 1218 insertions, 837 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a39a1e2f4..f1167989e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -28,6 +28,7 @@ from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .clipsyndicate import ClipsyndicateIE +from .cmt import CMTIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE @@ -79,7 +80,10 @@ from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE -from .imdb import ImdbIE +from .imdb import ( + ImdbIE, + ImdbListIE +) from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE @@ -91,12 +95,18 @@ from .ivi import ( from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE +from .jpopsukitv import JpopsukiIE from .kankan import KankanIE from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .macgamestore import MacGameStoreIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE @@ -189,6 +199,7 @@ from .vimeo import ( VimeoUserIE, VimeoAlbumIE, VimeoGroupsIE, + VimeoReviewIE, ) from .vine import VineIE from .viki import VikiIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index ef5644aa5..e7361ae06 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -110,7 +110,8 @@ class AppleTrailersIE(InfoExtractor): 'width': format['width'], 'height': int(format['height']), }) - formats = sorted(formats, key=lambda f: (f['height'], f['width'])) + + self._sort_formats(formats) playlist.append({ '_type': 'video', diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3a32c14c5..15aee2786 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,14 +10,14 @@ from ..utils import ( class BandcampIE(InfoExtractor): - IE_NAME = u'Bandcamp' _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', u'file': u'1812978515.mp3', - u'md5': u'cdeb30cdae1921719a3cbcab696ef53c', + u'md5': u'c557841d5e50261777a6585648adf439', u'info_dict': { - u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" + u"title": u"youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + u"duration": 10, }, u'skip': u'There is a limit of 200 free downloads / month for the test song' }] @@ -30,29 +30,42 @@ class BandcampIE(InfoExtractor): m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) - if m_trackinfo: - json_code = m_trackinfo.group(1) - data = json.loads(json_code) + if m_trackinfo: + json_code = m_trackinfo.group(1) + data = json.loads(json_code) + d = data[0] + + duration = int(round(d['duration'])) + formats = [] + for format_id, format_url in d['file'].items(): + ext, _, abr_str = format_id.partition('-') + + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': format_id.partition('-')[0], + 'vcodec': 'none', + 'acodec': format_id.partition('-')[0], + 'abr': int(format_id.partition('-')[2]), + }) + + self._sort_formats(formats) - for d in data: - formats = [{ - 'format_id': 'format_id', - 'url': format_url, - 'ext': format_id.partition('-')[0] - } for format_id, format_url in sorted(d['file'].items())] return { 'id': compat_str(d['id']), 'title': d['title'], 'formats': formats, + 'duration': duration, } - else: - raise ExtractorError(u'No free songs found') + else: + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) - id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', - webpage, re.MULTILINE|re.DOTALL).group('id') + video_id = re.search( + r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', + webpage, re.MULTILINE | re.DOTALL).group('id') - download_webpage = self._download_webpage(download_link, id, + download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') # We get the dictionary of the track from some javascrip code info = re.search(r'items: (.*?),$', @@ -66,21 +79,21 @@ class BandcampIE(InfoExtractor): m_url = re.match(re_url, initial_url) #We build the url we will use to get the final track url # This url is build in Bandcamp in the script download_bunde_*.js - request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) + request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts')) final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') # If we could correctly generate the .rand field the url would be #in the "download_url" key final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) - track_info = {'id':id, - 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, - 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] - } - - return [track_info] + return { + 'id': video_id, + 'title': info[u'title'], + 'ext': 'mp3', + 'vcodec': 'none', + 'url': final_url, + 'thumbnail': info[u'thumb_url'], + 'uploader': info[u'artist'], + } class BandcampAlbumIE(InfoExtractor): @@ -117,7 +130,7 @@ class BandcampAlbumIE(InfoExtractor): webpage = self._download_webpage(url, title) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: - raise ExtractorError(u'The page doesn\'t contain any track') + raise ExtractorError(u'The page doesn\'t contain any tracks') entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 144ce64cc..0229840a3 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -61,9 +61,10 @@ class BlinkxIE(InfoExtractor): elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') + tbr = (int(m['vbr']) + int(m['abr'])) // 1000 format_id = (u'%s-%sk-%s' % (vcodec, - (int(m['vbr']) + int(m['abr'])) // 1000, + tbr, m['w'])) formats.append({ 'format_id': format_id, @@ -72,10 +73,12 @@ class BlinkxIE(InfoExtractor): 'acodec': acodec, 'abr': int(m['abr']) // 1000, 'vbr': int(m['vbr']) // 1000, + 'tbr': tbr, 'width': int(m['w']), 'height': int(m['h']), }) - formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr'])) + + self._sort_formats(formats) return { 'id': display_id, diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 5e33a69df..3ce9b5324 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -1,16 +1,15 @@ +from __future__ import unicode_literals + import datetime import json -import os import re import socket from .common import InfoExtractor from ..utils import ( compat_http_client, - compat_parse_qs, compat_str, compat_urllib_error, - compat_urllib_parse_urlparse, compat_urllib_request, ExtractorError, @@ -22,42 +21,35 @@ class BlipTVIE(InfoExtractor): """Information extractor for blip.tv""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' - _URL_EXT = r'^.*\.([a-z0-9]+)$' - IE_NAME = u'blip.tv' + _TEST = { - u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - u'file': u'5779306.m4v', - u'md5': u'80baf1ec5c3d2019037c1c707d676b9f', - u'info_dict': { - u"upload_date": u"20111205", - u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596", - u"uploader": u"Comic Book Resources - CBR TV", - u"title": u"CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" + 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', + 'file': '5779306.mov', + 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'info_dict': { + 'upload_date': '20111205', + 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', + 'uploader': 'Comic Book Resources - CBR TV', + 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', } } def report_direct_download(self, title): """Report information extraction.""" - self.to_screen(u'%s: Direct download detected' % title) + self.to_screen('%s: Direct download detected' % title) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) # See https://github.com/rg3/youtube-dl/issues/857 - api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url) - if api_mobj is not None: - url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') - urlp = compat_urllib_parse_urlparse(url) - if urlp.path.startswith('/play/'): - response = self._request_webpage(url, None, False) - redirecturl = response.geturl() - rurlp = compat_urllib_parse_urlparse(redirecturl) - file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] - url = 'http://blip.tv/a/a-' + file_id - return self._real_extract(url) - + embed_mobj = re.search(r'^(?:https?://)?(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url) + if embed_mobj: + info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1) + info_page = self._download_webpage(info_url, embed_mobj.group(1)) + video_id = self._search_regex(r'data-episode-id="(\d+)', info_page, 'video_id') + return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV') if '?' in url: cchar = '&' @@ -67,67 +59,55 @@ class BlipTVIE(InfoExtractor): request = compat_urllib_request.Request(json_url) request.add_header('User-Agent', 'iTunes/10.6.1') self.report_extraction(mobj.group(1)) - info = None urlh = self._request_webpage(request, None, False, - u'unable to download video info webpage') - if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download - basename = url.split('/')[-1] - title,ext = os.path.splitext(basename) - title = title.decode('UTF-8') - ext = ext.replace('.', '') - self.report_direct_download(title) - info = { - 'id': title, - 'url': url, - 'uploader': None, - 'upload_date': None, - 'title': title, - 'ext': ext, - 'urlhandle': urlh + 'unable to download video info webpage') + + try: + json_code_bytes = urlh.read() + json_code = json_code_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError('Unable to read video info webpage: %s' % compat_str(err)) + + try: + json_data = json.loads(json_code) + if 'Post' in json_data: + data = json_data['Post'] + else: + data = json_data + + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') + formats = [] + if 'additionalMedia' in data: + for f in sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])): + if not int(f['media_width']): # filter m3u8 + continue + formats.append({ + 'url': f['url'], + 'format_id': f['role'], + 'width': int(f['media_width']), + 'height': int(f['media_height']), + }) + else: + formats.append({ + 'url': data['media']['url'], + 'width': int(data['media']['width']), + 'height': int(data['media']['height']), + }) + + self._sort_formats(formats) + + return { + 'id': compat_str(data['item_id']), + 'uploader': data['display_name'], + 'upload_date': upload_date, + 'title': data['title'], + 'thumbnail': data['thumbnailUrl'], + 'description': data['description'], + 'user_agent': 'iTunes/10.6.1', + 'formats': formats, } - if info is None: # Regular URL - try: - json_code_bytes = urlh.read() - json_code = json_code_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) - - try: - json_data = json.loads(json_code) - if 'Post' in json_data: - data = json_data['Post'] - else: - data = json_data - - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - if 'additionalMedia' in data: - formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) - best_format = formats[-1] - video_url = best_format['url'] - else: - video_url = data['media']['url'] - umobj = re.match(self._URL_EXT, video_url) - if umobj is None: - raise ValueError('Can not determine filename extension') - ext = umobj.group(1) - - info = { - 'id': compat_str(data['item_id']), - 'url': video_url, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'ext': ext, - 'format': data['media']['mimeType'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'player_url': data['embedUrl'], - 'user_agent': 'iTunes/10.6.1', - } - except (ValueError,KeyError) as err: - raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) - - return [info] + except (ValueError, KeyError) as err: + raise ExtractorError('Unable to parse video information: %s' % repr(err)) class BlipTVUserIE(InfoExtractor): @@ -135,19 +115,19 @@ class BlipTVUserIE(InfoExtractor): _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' _PAGE_SIZE = 12 - IE_NAME = u'blip.tv:user' + IE_NAME = 'blip.tv:user' def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) username = mobj.group(1) page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - page = self._download_webpage(url, username, u'Downloading user page') + page = self._download_webpage(url, username, 'Downloading user page') mobj = re.search(r'data-users-id="([^"]+)"', page) page_base = page_base % mobj.group(1) @@ -163,7 +143,7 @@ class BlipTVUserIE(InfoExtractor): while True: url = page_base + "&page=" + str(pagenum) page = self._download_webpage(url, username, - u'Downloading video ids from page %d' % pagenum) + 'Downloading video ids from page %d' % pagenum) # Extract video identifiers ids_in_page = [] @@ -185,6 +165,6 @@ class BlipTVUserIE(InfoExtractor): pagenum += 1 - urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] return [self.playlist_result(url_entries, playlist_title = username)] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f7f0041c0..4ba3f7c42 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import re import json @@ -13,6 +14,7 @@ from ..utils import ( compat_urllib_request, ExtractorError, + unsmuggle_url, ) @@ -24,47 +26,47 @@ class BrightcoveIE(InfoExtractor): _TESTS = [ { # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ - u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', - u'file': u'2371591881001.mp4', - u'md5': u'5423e113865d26e40624dce2e4b45d95', - u'note': u'Test Brightcove downloads and detection in GenericIE', - u'info_dict': { - u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', - u'uploader': u'8TV', - u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', + 'file': '2371591881001.mp4', + 'md5': '5423e113865d26e40624dce2e4b45d95', + 'note': 'Test Brightcove downloads and detection in GenericIE', + 'info_dict': { + 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + 'uploader': '8TV', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', } }, { # From http://medianetwork.oracle.com/video/player/1785452137001 - u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', - u'file': u'1785452137001.flv', - u'info_dict': { - u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', - u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', - u'uploader': u'Oracle', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', + 'file': '1785452137001.flv', + 'info_dict': { + 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', + 'uploader': 'Oracle', }, }, { # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ - u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', - u'info_dict': { - u'id': u'2750934548001', - u'ext': u'mp4', - u'title': u'This Bracelet Acts as a Personal Thermostat', - u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', - u'uploader': u'Mashable', + 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', + 'info_dict': { + 'id': '2750934548001', + 'ext': 'mp4', + 'title': 'This Bracelet Acts as a Personal Thermostat', + 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', + 'uploader': 'Mashable', }, }, { # test that the default referer works # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ - u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', - u'info_dict': { - u'id': u'2878862109001', - u'ext': u'mp4', - u'title': u'Lost in Motion II', - u'description': u'md5:363109c02998fee92ec02211bd8000df', - u'uploader': u'National Ballet of Canada', + 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', + 'info_dict': { + 'id': '2878862109001', + 'ext': 'mp4', + 'title': 'Lost in Motion II', + 'description': 'md5:363109c02998fee92ec02211bd8000df', + 'uploader': 'National Ballet of Canada', }, }, ] @@ -80,10 +82,10 @@ class BrightcoveIE(InfoExtractor): object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>', lambda m: m.group(1) + '/>', object_str) # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 - object_str = object_str.replace(u'<--', u'<!--') + object_str = object_str.replace('<--', '<!--') object_doc = xml.etree.ElementTree.fromstring(object_str) - assert u'BrightcoveExperience' in object_doc.attrib['class'] + assert 'BrightcoveExperience' in object_doc.attrib['class'] params = {'flashID': object_doc.attrib['id'], 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } @@ -120,6 +122,8 @@ class BrightcoveIE(InfoExtractor): return None def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + # Change the 'videoId' and others field to '@videoPlayer' url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) # Change bckey (used by bcove.me urls) to playerKey @@ -130,9 +134,10 @@ class BrightcoveIE(InfoExtractor): videoPlayer = query.get('@videoPlayer') if videoPlayer: - return self._get_video_info(videoPlayer[0], query_str, query, - # We set the original url as the default 'Referer' header - referer=url) + # We set the original url as the default 'Referer' header + referer = smuggled_data.get('Referer', url) + return self._get_video_info( + videoPlayer[0], query_str, query, referer=referer) else: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -156,11 +161,11 @@ class BrightcoveIE(InfoExtractor): def _get_playlist_info(self, player_key): playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, - player_key, u'Downloading playlist information') + player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) if 'videoList' not in json_data: - raise ExtractorError(u'Empty playlist') + raise ExtractorError('Empty playlist') playlist_info = json_data['videoList'] videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] @@ -189,5 +194,5 @@ class BrightcoveIE(InfoExtractor): 'url': video_info['FLVFullLengthURL'], }) else: - raise ExtractorError(u'Unable to extract video url for %s' % info['id']) + raise ExtractorError('Unable to extract video url for %s' % info['id']) return info diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index ae70ea229..574881b70 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -76,14 +76,18 @@ class Channel9IE(InfoExtractor): </div>)? # File size part may be missing ''' # Extract known formats - formats = [{'url': x.group('url'), - 'format_id': x.group('quality'), - 'format_note': x.group('note'), - 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] - # Sort according to known formats list - formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + formats = [{ + 'url': x.group('url'), + 'format_id': x.group('quality'), + 'format_note': x.group('note'), + 'format': u'%s (%s)' % (x.group('quality'), x.group('note')), + 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate + 'preference': self._known_formats.index(x.group('quality')), + 'vcodec': 'none' if x.group('note') == 'Audio only' else None, + } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + + self._sort_formats(formats) + return formats def _extract_title(self, html): diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py new file mode 100644 index 000000000..88e0e9aba --- /dev/null +++ b/youtube_dl/extractor/cmt.py @@ -0,0 +1,19 @@ +from .mtv import MTVIE + +class CMTIE(MTVIE): + IE_NAME = u'cmt.com' + _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml' + _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' + + _TESTS = [ + { + u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2', + u'info_dict': { + u'id': u'989124', + u'ext': u'mp4', + u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + u'description': u'Blame It All On My Roots', + }, + }, + ] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a034bb2fb..ecac5e0e9 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + int_or_none, + parse_duration, +) class CNNIE(InfoExtractor): @@ -15,6 +18,8 @@ class CNNIE(InfoExtractor): u'info_dict': { u'title': u'Nadal wins 8th French Open title', u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + u'duration': 135, + u'upload_date': u'20130609', }, }, { @@ -35,22 +40,58 @@ class CNNIE(InfoExtractor): info = self._download_xml(info_url, page_title) formats = [] + rex = re.compile(r'''(?x) + (?P<width>[0-9]+)x(?P<height>[0-9]+) + (?:_(?P<bitrate>[0-9]+)k)? + ''') for f in info.findall('files/file'): - mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) - if mf is not None: - formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) - formats = sorted(formats) - (_,_,_, video_path) = formats[-1] - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) + fdct = { + 'format_id': f.attrib['bitrate'], + 'url': video_url, + } + + mf = rex.match(f.attrib['bitrate']) + if mf: + fdct['width'] = int(mf.group('width')) + fdct['height'] = int(mf.group('height')) + fdct['tbr'] = int_or_none(mf.group('bitrate')) + else: + mf = rex.search(f.text) + if mf: + fdct['width'] = int(mf.group('width')) + fdct['height'] = int(mf.group('height')) + fdct['tbr'] = int_or_none(mf.group('bitrate')) + else: + mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) + if mi: + if mi.group(1) == 'audio': + fdct['vcodec'] = 'none' + fdct['ext'] = 'm4a' + else: + fdct['tbr'] = int(mi.group(1)) + + formats.append(fdct) + + self._sort_formats(formats) thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] - return {'id': info.attrib['id'], - 'title': info.find('headline').text, - 'url': video_url, - 'ext': determine_ext(video_url), - 'thumbnail': thumbnails[-1][1], - 'thumbnails': thumbs_dict, - 'description': info.find('description').text, - } + metas_el = info.find('metas') + upload_date = ( + metas_el.attrib.get('version') if metas_el is not None else None) + + duration_el = info.find('length') + duration = parse_duration(duration_el.text) + + return { + 'id': info.attrib['id'], + 'title': info.find('headline').text, + 'formats': formats, + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + 'duration': duration, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index b27c1dfc5..d10b7bd0c 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,82 +1,68 @@ +from __future__ import unicode_literals + +import json import re from .common import InfoExtractor -from ..utils import ( - compat_urllib_parse_urlparse, - determine_ext, - - ExtractorError, -) class CollegeHumorIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' _TESTS = [{ - u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', - u'file': u'6902724.mp4', - u'md5': u'1264c12ad95dca142a9f0bf7968105a0', - u'info_dict': { - u'title': u'Comic-Con Cosplay Catastrophe', - u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', + 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + 'file': '6902724.mp4', + 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd', + 'info_dict': { + 'title': 'Comic-Con Cosplay Catastrophe', + 'description': 'Fans get creative this year at San Diego. Too', + 'age_limit': 13, }, }, { - u'url': u'http://www.collegehumor.com/video/3505939/font-conference', - u'file': u'3505939.mp4', - u'md5': u'c51ca16b82bb456a4397987791a835f5', - u'info_dict': { - u'title': u'Font Conference', - u'description': u'This video wasn\'t long enough, so we made it double-spaced.', + 'url': 'http://www.collegehumor.com/video/3505939/font-conference', + 'file': '3505939.mp4', + 'md5': '72fa701d8ef38664a4dbb9e2ab721816', + 'info_dict': { + 'title': 'Font Conference', + 'description': 'This video wasn\'t long enough, so we made it double-spaced.', + 'age_limit': 10, }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') - info = { - 'id': video_id, - 'uploader': None, - 'upload_date': None, - } - - self.report_extraction(video_id) - xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - mdoc = self._download_xml(xmlUrl, video_id, - u'Downloading info XML', - u'Unable to download video info XML') + jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json' + data = json.loads(self._download_webpage( + jsonUrl, video_id, 'Downloading info JSON')) + vdata = data['video'] - try: - videoNode = mdoc.findall('./video')[0] - youtubeIdNode = videoNode.find('./youtubeID') - if youtubeIdNode is not None: - return self.url_result(youtubeIdNode.text, 'Youtube') - info['description'] = videoNode.findall('./description')[0].text - info['title'] = videoNode.findall('./caption')[0].text - info['thumbnail'] = videoNode.findall('./thumbnail')[0].text - next_url = videoNode.findall('./file')[0].text - except IndexError: - raise ExtractorError(u'Invalid metadata XML file') - - if next_url.endswith(u'manifest.f4m'): - manifest_url = next_url + '?hdcore=2.10.3' - adoc = self._download_xml(manifest_url, video_id, - u'Downloading XML manifest', - u'Unable to download video info XML') - - try: - video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError: - raise ExtractorError(u'Invalid manifest file') - url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') - info['ext'] = 'mp4' + AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} + rating = vdata.get('rating') + if rating: + age_limit = AGE_LIMITS.get(rating.lower()) else: - # Old-style direct links - info['url'] = next_url - info['ext'] = determine_ext(info['url']) + age_limit = None # None = No idea + + PREFS = {'high_quality': 2, 'low_quality': 0} + formats = [] + for format_key in ('mp4', 'webm'): + for qname, qurl in vdata[format_key].items(): + formats.append({ + 'format_id': format_key + '_' + qname, + 'url': qurl, + 'format': format_key, + 'preference': PREFS.get(qname), + }) + self._sort_formats(formats) - return info + return { + 'id': video_id, + 'title': vdata['title'], + 'description': vdata.get('description'), + 'thumbnail': vdata.get('thumbnail'), + 'formats': formats, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index a54ce3ee7..27bd8256e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -12,7 +12,9 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ + (video-clips|episodes|cc-studios|video-collections) + /(?P<title>.*)''' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' _TEST = { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ba46a7bc7..2a5e8076c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -9,6 +9,7 @@ import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, + compat_urllib_parse_urlparse, compat_str, clean_html, @@ -37,10 +38,12 @@ class InfoExtractor(object): id: Video identifier. title: Video title, unescaped. - Additionally, it must contain either a formats entry or url and ext: + Additionally, it must contain either a formats entry or a url one: - formats: A list of dictionaries for each format available, it must - be ordered from worst to best quality. Potential fields: + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: * url Mandatory. The URL of the video file * ext Will be calculated from url if missing * format A human-readable description of the format @@ -48,23 +51,36 @@ class InfoExtractor(object): Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format - ("mp4_h264_opus" or "19") + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. * format_note Additional info about the format ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp" or so. + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field. + -1 for default (order by other properties), + -2 or smaller for less than default. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) player_url: SWF Player URL (used for rtmpdump). - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen The following fields are optional: @@ -244,6 +260,11 @@ class InfoExtractor(object): xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def report_warning(self, msg, video_id=None): + idstr = u'' if video_id is None else u'%s: ' % video_id + self._downloader.report_warning( + u'[%s] %s%s' % (self.IE_NAME, idstr, msg)) + def to_screen(self, msg): """Print msg to screen, prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) @@ -361,7 +382,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' - property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -426,6 +447,57 @@ class InfoExtractor(object): } return RATING_TABLE.get(rating.lower(), None) + def _sort_formats(self, formats): + def _formats_key(f): + # TODO remove the following workaround + from ..utils import determine_ext + if not f.get('ext') and 'url' in f: + f['ext'] = determine_ext(f['url']) + + preference = f.get('preference') + if preference is None: + proto = f.get('protocol') + if proto is None: + proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme + + preference = 0 if proto in ['http', 'https'] else -0.1 + if f.get('ext') in ['f4f', 'f4m']: # Not yet supported + preference -= 0.5 + + if f.get('vcodec') == 'none': # audio only + if self._downloader.params.get('prefer_free_formats'): + ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus'] + else: + ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a'] + ext_preference = 0 + try: + audio_ext_preference = ORDER.index(f['ext']) + except ValueError: + audio_ext_preference = -1 + else: + if self._downloader.params.get('prefer_free_formats'): + ORDER = [u'flv', u'mp4', u'webm'] + else: + ORDER = [u'webm', u'flv', u'mp4'] + try: + ext_preference = ORDER.index(f['ext']) + except ValueError: + ext_preference = -1 + audio_ext_preference = 0 + + return ( + preference, + f.get('quality') if f.get('quality') is not None else -1, + f.get('height') if f.get('height') is not None else -1, + f.get('width') if f.get('width') is not None else -1, + ext_preference, + f.get('vbr') if f.get('vbr') is not None else -1, + f.get('abr') if f.get('abr') is not None else -1, + audio_ext_preference, + f.get('filesize') if f.get('filesize') is not None else -1, + f.get('format_id'), + ) + formats.sort(key=_formats_key) class SearchInfoExtractor(InfoExtractor): diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d5730684d..a2cbd4d8d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -1,20 +1,25 @@ +from __future__ import unicode_literals + +import json import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, + unescapeHTML, ) + class CSpanIE(InfoExtractor): _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' + IE_DESC = 'C-SPAN' _TEST = { - u'url': u'http://www.c-spanvideo.org/program/HolderonV', - u'file': u'315139.flv', - u'md5': u'74a623266956f69e4df0068ab6c80fe4', - u'info_dict': { - u"title": u"Attorney General Eric Holder on Voting Rights Act Decision" + 'url': 'http://www.c-spanvideo.org/program/HolderonV', + 'file': '315139.mp4', + 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'info_dict': { + 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', + 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', }, - u'skip': u'Requires rtmpdump' } def _real_extract(self, url): @@ -22,30 +27,22 @@ class CSpanIE(InfoExtractor): prog_name = mobj.group(1) webpage = self._download_webpage(url, prog_name) video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id') - data = compat_urllib_parse.urlencode({'programid': video_id, - 'dynamic':'1'}) - info_url = 'http://www.c-spanvideo.org/common/services/flashXml.php?' + data - video_info = self._download_webpage(info_url, video_id, u'Downloading video info') - - self.report_extraction(video_id) - - title = self._html_search_regex(r'<string name="title">(.*?)</string>', - video_info, 'title') - description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"', - webpage, 'description', - flags=re.MULTILINE|re.DOTALL) - - url = self._search_regex(r'<string name="URL">(.*?)</string>', - video_info, 'video url') - url = url.replace('$(protocol)', 'rtmp').replace('$(port)', '443') - path = self._search_regex(r'<string name="path">(.*?)</string>', - video_info, 'rtmp play path') - - return {'id': video_id, - 'title': title, - 'ext': 'flv', - 'url': url, - 'play_path': path, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage), - } + + title = self._html_search_regex( + r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title') + description = self._og_search_description(webpage) + + info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id + data_json = self._download_webpage( + info_url, video_id, 'Downloading video info') + data = json.loads(data_json) + + url = unescapeHTML(data['video']['files'][0]['path']['#text']) + + return { + 'id': video_id, + 'title': title, + 'url': url, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index cb7226f82..0b11d1f10 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -4,18 +4,17 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, unified_strdate, ) class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", - u'file': u'36983.webm', - u'md5': u'57c97d0469d71cf874f6815aa2b7c944', + u'file': u'36983.mp4', + u'md5': u'9dcfe344732808dbfcc901537973c922', u'info_dict': { u"title": u"Kaffeeland Schweiz", u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...", @@ -52,18 +51,12 @@ class DreiSatIE(InfoExtractor): 'width': int(fe.find('./width').text), 'height': int(fe.find('./height').text), 'url': fe.find('./url').text, - 'ext': determine_ext(fe.find('./url').text), 'filesize': int(fe.find('./filesize').text), 'video_bitrate': int(fe.find('./videoBitrate').text), - '3sat_qualityname': fe.find('./quality').text, } for fe in format_els if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')] - def _sortkey(format): - qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname']) - prefer_http = 1 if 'rtmp' in format['url'] else 0 - return (qidx, prefer_http, format['video_bitrate']) - formats.sort(key=_sortkey) + self._sort_formats(formats) return { '_type': 'video', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a14c98f9..7d0e117de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1,9 +1,12 @@ # encoding: utf-8 +from __future__ import unicode_literals + import os import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( compat_urllib_error, compat_urllib_parse, @@ -22,78 +25,78 @@ from .ooyala import OoyalaIE class GenericIE(InfoExtractor): - IE_DESC = u'Generic downloader that works on some sites' + IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' - IE_NAME = u'generic' + IE_NAME = 'generic' _TESTS = [ { - u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', - u'file': u'13601338388002.mp4', - u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd', - u'info_dict': { - u"uploader": u"www.hodiho.fr", - u"title": u"R\u00e9gis plante sa Jeep" + 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', + 'file': '13601338388002.mp4', + 'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd', + 'info_dict': { + 'uploader': 'www.hodiho.fr', + 'title': 'R\u00e9gis plante sa Jeep', } }, # embedded vimeo video { - u'add_ie': ['Vimeo'], - u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', - u'file': u'22444065.mp4', - u'md5': u'2903896e23df39722c33f015af0666e2', - u'info_dict': { - u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', - u"uploader_id": u"skillsmatter", - u"uploader": u"Skills Matter", + 'add_ie': ['Vimeo'], + 'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', + 'file': '22444065.mp4', + 'md5': '2903896e23df39722c33f015af0666e2', + 'info_dict': { + 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', + 'uploader_id': 'skillsmatter', + 'uploader': 'Skills Matter', } }, # bandcamp page with custom domain { - u'add_ie': ['Bandcamp'], - u'url': u'http://bronyrock.com/track/the-pony-mash', - u'file': u'3235767654.mp3', - u'info_dict': { - u'title': u'The Pony Mash', - u'uploader': u'M_Pallante', + 'add_ie': ['Bandcamp'], + 'url': 'http://bronyrock.com/track/the-pony-mash', + 'file': '3235767654.mp3', + 'info_dict': { + 'title': 'The Pony Mash', + 'uploader': 'M_Pallante', }, - u'skip': u'There is a limit of 200 free downloads / month for the test song', + 'skip': 'There is a limit of 200 free downloads / month for the test song', }, # embedded brightcove video # it also tests brightcove videos that need to set the 'Referer' in the # http requests { - u'add_ie': ['Brightcove'], - u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - u'info_dict': { - u'id': u'2765128793001', - u'ext': u'mp4', - u'title': u'Le cours de bourse : l’analyse technique', - u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', - u'uploader': u'BFM BUSINESS', + 'add_ie': ['Brightcove'], + 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', + 'info_dict': { + 'id': '2765128793001', + 'ext': 'mp4', + 'title': 'Le cours de bourse : l’analyse technique', + 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', + 'uploader': 'BFM BUSINESS', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, }, # Direct link to a video { - u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4', - u'file': u'trailer.mp4', - u'md5': u'67d406c2bcb6af27fa886f31aa934bbe', - u'info_dict': { - u'id': u'trailer', - u'title': u'trailer', - u'upload_date': u'20100513', + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'file': 'trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'title': 'trailer', + 'upload_date': '20100513', } }, # ooyala video { - u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c', - u'info_dict': { - u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', - u'ext': u'mp4', - u'title': u'2cc213299525360.mov', #that's what we get + 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', + 'info_dict': { + 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', + 'ext': 'mp4', + 'title': '2cc213299525360.mov', #that's what we get }, }, ] @@ -101,12 +104,12 @@ class GenericIE(InfoExtractor): def report_download_webpage(self, video_id): """Report webpage download.""" if not self._downloader.params.get('test', False): - self._downloader.report_warning(u'Falling back on generic information extractor.') + self._downloader.report_warning('Falling back on generic information extractor.') super(GenericIE, self).report_download_webpage(video_id) def report_following_redirect(self, new_url): """Report information extraction.""" - self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) + self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" @@ -152,7 +155,7 @@ class GenericIE(InfoExtractor): response = opener.open(HEADRequest(url)) if response is None: - raise ExtractorError(u'Invalid URL protocol') + raise ExtractorError('Invalid URL protocol') return response def _real_extract(self, url): @@ -162,6 +165,8 @@ class GenericIE(InfoExtractor): return self.url_result('http://' + url) video_id = os.path.splitext(url.split('/')[-1])[0] + self.to_screen('%s: Requesting header' % video_id) + try: response = self._send_head(url) @@ -184,7 +189,7 @@ class GenericIE(InfoExtractor): 'formats': [{ 'format_id': m.group('format_id'), 'url': url, - 'vcodec': u'none' if m.group('type') == 'audio' else None + 'vcodec': 'none' if m.group('type') == 'audio' else None }], 'upload_date': upload_date, } @@ -198,7 +203,7 @@ class GenericIE(InfoExtractor): except ValueError: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here - raise ExtractorError(u'Failed to download URL: %s' % url) + raise ExtractorError('Failed to download URL: %s' % url) self.report_extraction(video_id) @@ -209,18 +214,19 @@ class GenericIE(InfoExtractor): # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical video_title = self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, u'video title', - default=u'video') + r'(?s)<title>(.*?)</title>', webpage, 'video title', + default='video') # video uploader is domain name video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, u'video uploader') + r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') # Look for BrightCove: bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is not None: - self.to_screen(u'Brightcove video detected.') - return self.url_result(bc_url, 'Brightcove') + self.to_screen('Brightcove video detected.') + surl = smuggle_url(bc_url, {'Referer': url}) + return self.url_result(surl, 'Brightcove') # Look for embedded (iframe) Vimeo player mobj = re.search( @@ -271,16 +277,12 @@ class GenericIE(InfoExtractor): } # Look for embedded blip.tv player - mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage) + mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) if mobj: - return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV') - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage) + return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage) if mobj: - player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1) - player_page = self._download_webpage(player_url, mobj.group(1)) - blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False) - if blip_video_id: - return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV') + return self.url_result(mobj.group(1), 'BlipTV') # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) @@ -308,6 +310,9 @@ class GenericIE(InfoExtractor): # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: + # Look for gorilla-vid style embedding + mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage) + if mobj is None: # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) if mobj is None: @@ -327,23 +332,27 @@ class GenericIE(InfoExtractor): # HTML5 video mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) if mobj is None: - raise ExtractorError(u'Unsupported URL: %s' % url) + raise ExtractorError('Unsupported URL: %s' % url) # It's possible that one of the regexes # matched, but returned an empty group: if mobj.group(1) is None: - raise ExtractorError(u'Did not find a valid video URL at %s' % url) + raise ExtractorError('Did not find a valid video URL at %s' % url) video_url = mobj.group(1) video_url = compat_urlparse.urljoin(url, video_url) video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) + # Sometimes, jwplayer extraction will result in a YouTube URL + if YoutubeIE.suitable(video_url): + return self.url_result(video_url, 'Youtube') + # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] return { - 'id': video_id, - 'url': video_url, + 'id': video_id, + 'url': video_url, 'uploader': video_uploader, - 'title': video_title, + 'title': video_title, } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index e5332cce8..16926b4d3 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -55,3 +55,32 @@ class ImdbIE(InfoExtractor): 'description': descr, 'thumbnail': format_info['slate'], } + +class ImdbListIE(InfoExtractor): + IE_NAME = u'imdb:list' + IE_DESC = u'Internet Movie Database lists' + _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + list_id = mobj.group('id') + + # RSS XML is sometimes malformed + rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, u'Downloading list RSS') + list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, u'list title') + + # Export is independent of actual author_id, but returns 404 if no author_id is provided. + # However, passing dummy author_id seems to be enough. + csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id, + list_id, u'Downloading list CSV') + + entries = [] + for item in csv.split('\n')[1:]: + cols = item.split(',') + if len(cols) < 2: + continue + item_id = cols[1][1:-1] + if item_id.startswith('vi'): + entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb')) + + return self.playlist_result(entries, list_id, list_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 16a6f73c8..4ddda2f1b 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -5,7 +5,6 @@ from ..utils import ( compat_urlparse, compat_urllib_parse, xpath_with_ns, - determine_ext, ) @@ -63,13 +62,17 @@ class InternetVideoArchiveIE(InfoExtractor): for content in item.findall(_bp('media:group/media:content')): attr = content.attrib f_url = attr['url'] + width = int(attr['width']) + bitrate = int(attr['bitrate']) + format_id = '%d-%dk' % (width, bitrate) formats.append({ + 'format_id': format_id, 'url': f_url, - 'ext': determine_ext(f_url), - 'width': int(attr['width']), - 'bitrate': int(attr['bitrate']), + 'width': width, + 'tbr': bitrate, }) - formats = sorted(formats, key=lambda f: f['bitrate']) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 4bdf55f93..98d1d272a 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -84,14 +84,16 @@ class IviIE(InfoExtractor): result = video_json[u'result'] - formats = [{'url': x[u'url'], - 'format_id': x[u'content_format'] - } for x in result[u'files'] if x[u'content_format'] in self._known_formats] - formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) - - if len(formats) == 0: - self._downloader.report_warning(u'No media links available for %s' % video_id) - return + formats = [{ + 'url': x[u'url'], + 'format_id': x[u'content_format'], + 'preference': self._known_formats.index(x[u'content_format']), + } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + + self._sort_formats(formats) + + if not formats: + raise ExtractorError(u'No media links available for %s' % video_id) duration = result[u'duration'] compilation = result[u'compilation'] diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py new file mode 100644 index 000000000..aad782578 --- /dev/null +++ b/youtube_dl/extractor/jpopsukitv.py @@ -0,0 +1,73 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class JpopsukiIE(InfoExtractor): + IE_NAME = 'jpopsuki.tv' + _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' + + _TEST = { + 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', + 'md5': '88018c0c1a9b1387940e90ec9e7e198e', + 'file': '00be659d23b0b40508169cdee4545771.mp4', + 'info_dict': { + 'id': '00be659d23b0b40508169cdee4545771', + 'title': 'ayumi hamasaki - evolution', + 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', + 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', + 'uploader': 'plama_chan', + 'uploader_id': '404', + 'upload_date': '20121101' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = 'http://www.jpopsuki.tv' + self._html_search_regex( + r'<source src="(.*?)" type', webpage, 'video url') + + video_title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<li>from: <a href="/user/view/user/(.*?)/uid/', + webpage, 'video uploader', fatal=False) + uploader_id = self._html_search_regex( + r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)', + webpage, 'video uploader_id', fatal=False) + upload_date = self._html_search_regex( + r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', + fatal=False) + if upload_date is not None: + upload_date = unified_strdate(upload_date) + view_count_str = self._html_search_regex( + r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count', + fatal=False) + comment_count_str = self._html_search_regex( + r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count', + fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': int_or_none(view_count_str), + 'comment_count': int_or_none(comment_count_str), + } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py new file mode 100644 index 000000000..844ba4dcb --- /dev/null +++ b/youtube_dl/extractor/lynda.py @@ -0,0 +1,142 @@ +from __future__ import unicode_literals + +import re +import json + +from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor +from ..utils import ExtractorError + + +class LyndaIE(SubtitlesInfoExtractor): + IE_NAME = 'lynda' + IE_DESC = 'lynda.com videos' + _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' + + _TEST = { + 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'file': '114408.mp4', + 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', + u"info_dict": { + 'title': 'Using the exercise files', + 'duration': 68 + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, + video_id, 'Downloading video JSON') + video_json = json.loads(page) + + if 'Status' in video_json and video_json['Status'] == 'NotFound': + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + if video_json['HasAccess'] is False: + raise ExtractorError('Video %s is only available for members' % video_id, expected=True) + + video_id = video_json['ID'] + duration = video_json['DurationInSeconds'] + title = video_json['Title'] + + formats = [{'url': fmt['Url'], + 'ext': fmt['Extension'], + 'width': fmt['Width'], + 'height': fmt['Height'], + 'filesize': fmt['FileSize'], + 'format_id': str(fmt['Resolution']) + } for fmt in video_json['Formats']] + + self._sort_formats(formats) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, page) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'subtitles': subtitles, + 'formats': formats + } + + _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' + + def _fix_subtitles(self, subtitles): + fixed_subtitles = {} + for k, v in subtitles.items(): + subs = json.loads(v) + if len(subs) == 0: + continue + srt = '' + for pos in range(0, len(subs) - 1): + seq_current = subs[pos] + m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) + if m_current is None: + continue + seq_next = subs[pos+1] + m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) + if m_next is None: + continue + appear_time = m_current.group('timecode') + disappear_time = m_next.group('timecode') + text = seq_current['Caption'] + srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) + if srt: + fixed_subtitles[k] = srt + return fixed_subtitles + + def _get_available_subtitles(self, video_id, webpage): + url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id + sub = self._download_webpage(url, None, note=False) + sub_json = json.loads(sub) + return {'en': url} if len(sub_json) > 0 else {} + + +class LyndaCourseIE(InfoExtractor): + IE_NAME = 'lynda:course' + IE_DESC = 'lynda.com online courses' + + # Course link equals to welcome/introduction video link of same course + # We will recognize it as course link + _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_path = mobj.group('coursepath') + course_id = mobj.group('courseid') + + page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, + course_id, 'Downloading course JSON') + course_json = json.loads(page) + + if 'Status' in course_json and course_json['Status'] == 'NotFound': + raise ExtractorError('Course %s does not exist' % course_id, expected=True) + + unaccessible_videos = 0 + videos = [] + + for chapter in course_json['Chapters']: + for video in chapter['Videos']: + if video['HasAccess'] is not True: + unaccessible_videos += 1 + continue + videos.append(video['ID']) + + if unaccessible_videos > 0: + self._downloader.report_warning('%s videos are only available for members and will not be downloaded' % unaccessible_videos) + + entries = [ + self.url_result('http://www.lynda.com/%s/%s-4.html' % + (course_path, video_id), + 'Lynda') + for video_id in videos] + + course_title = course_json['Title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py new file mode 100644 index 000000000..b818cf50c --- /dev/null +++ b/youtube_dl/extractor/macgamestore.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class MacGameStoreIE(InfoExtractor): + IE_NAME = 'macgamestore' + IE_DESC = 'MacGameStore trailers' + _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', + 'file': '2450.m4v', + 'md5': '8649b8ea684b6666b4c5be736ecddc61', + 'info_dict': { + 'title': 'Crow', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id, 'Downloading trailer page') + + if re.search(r'>Missing Media<', webpage) is not None: + raise ExtractorError('Trailer %s does not exist' % video_id, expected=True) + + video_title = self._html_search_regex( + r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title') + + video_url = self._html_search_regex( + r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>', + webpage, 'video URL') + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title + } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 08ce0647f..7aa0080d7 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -52,10 +52,11 @@ class MDRIE(InfoExtractor): 'format_id': u'%s-%d' % (media_type, vbr), }) formats.append(format) - formats.sort(key=lambda f: (f.get('vbr'), f['abr'])) if not formats: raise ExtractorError(u'Could not find any valid formats') + self._sort_formats(formats) + return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 52be9232f..76b717fe5 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -33,8 +33,18 @@ class TechTVMITIE(InfoExtractor): raw_page, u'base url') formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, u'video formats') - formats = json.loads(formats_json) - formats = sorted(formats, key=lambda f: f['bitrate']) + formats_mit = json.loads(formats_json) + formats = [ + { + 'format_id': f['label'], + 'url': base_url + f['url'].partition(':')[2], + 'ext': f['url'].partition(':')[0], + 'format': f['label'], + 'width': f['width'], + 'vbr': f['bitrate'], + } + for f in formats_mit + ] title = get_element_by_id('edit-title', clean_page) description = clean_html(get_element_by_id('edit-description', clean_page)) @@ -43,8 +53,7 @@ class TechTVMITIE(InfoExtractor): return {'id': video_id, 'title': title, - 'url': base_url + formats[-1]['url'].replace('mp4:', ''), - 'ext': 'mp4', + 'formats': formats, 'description': description, 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 125d81551..7c54ea0f4 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -53,7 +53,7 @@ class MixcloudIE(InfoExtractor): info = json.loads(json_data) preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') - song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') + song_url = preview_url.replace('/previews/', '/c/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) final_song_url = self._get_url(template_url) if final_song_url is None: diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed11f521a..f1cf41e2d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -129,7 +129,7 @@ class MTVIE(MTVServicesInfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - uri = mobj.group('mgid') + uri = mobj.groupdict().get('mgid') if uri is None: webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 0404e6e43..6d35c7861 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -143,8 +143,10 @@ class MyVideoIE(InfoExtractor): if mobj: video_url = compat_urllib_parse.unquote(mobj.group(1)) if 'myvideo2flash' in video_url: - self._downloader.report_warning(u'forcing RTMPT ...') - video_url = video_url.replace('rtmpe://', 'rtmpt://') + self.report_warning( + u'Rewriting URL to use unencrypted rtmp:// ...', + video_id) + video_url = video_url.replace('rtmpe://', 'rtmp://') if not video_url: # extract non rtmp videos diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index b42eae89a..88f03608b 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,54 +1,98 @@ # coding: utf-8 +from __future__ import unicode_literals -import re -import xml.etree.ElementTree import json +import re from .common import InfoExtractor from ..utils import ( - compat_urlparse, - ExtractorError, - find_xpath_attr, + HEADRequest, + unified_strdate, ) + class ORFIE(InfoExtractor): - _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)' + + _TEST = { + 'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747', + 'file': '7319747.mp4', + 'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375', + 'info_dict': { + 'title': 'Was Sie schon immer über Klassik wissen wollten', + 'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4', + 'duration': 3508, + 'upload_date': '20140105', + }, + 'skip': 'Blocked outside of Austria', + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) - flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') - flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] - flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) - playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') - playlist = json.loads(playlist_json) - - videos = [] - ns = '{http://tempuri.org/XMLSchema.xsd}' - xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} - webpage_description = self._og_search_description(webpage) - for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): - # Get best quality url - rtmp_url = None - for q in ['Q6A', 'Q4A', 'Q1A']: - video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) - if video_url is not None: - rtmp_url = video_url.text - break - if rtmp_url is None: - raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) - description = self._html_search_regex( - r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, - u'description', default=webpage_description, flags=re.DOTALL) - videos.append({ + data_json = self._search_regex( + r'initializeAdworx\((.+?)\);\n', webpage, 'video info') + all_data = json.loads(data_json) + sdata = all_data[0]['values']['segments'] + + def quality_to_int(s): + m = re.search('([0-9]+)', s) + if m is None: + return -1 + return int(m.group(1)) + + entries = [] + for sd in sdata: + video_id = sd['id'] + formats = [{ + 'preference': -10 if fd['delivery'] == 'hls' else None, + 'format_id': '%s-%s-%s' % ( + fd['delivery'], fd['quality'], fd['quality_string']), + 'url': fd['src'], + 'protocol': fd['protocol'], + 'quality': quality_to_int(fd['quality']), + } for fd in sd['playlist_item_array']['sources']] + + # Check for geoblocking. + # There is a property is_geoprotection, but that's always false + geo_str = sd.get('geoprotection_string') + if geo_str: + try: + http_url = next( + f['url'] + for f in formats + if re.match(r'^https?://.*\.mp4$', f['url'])) + except StopIteration: + pass + else: + req = HEADRequest(http_url) + response = self._request_webpage( + req, video_id, + note='Testing for geoblocking', + errnote=(( + 'This video seems to be blocked outside of %s. ' + 'You may want to try the streaming-* formats.') + % geo_str), + fatal=False) + + self._sort_formats(formats) + + upload_date = unified_strdate(sd['created_date']) + entries.append({ '_type': 'video', - 'id': info['id'], - 'title': info['title'], - 'url': rtmp_url, - 'ext': 'flv', - 'description': description, - }) - - return videos + 'id': video_id, + 'title': sd['header'], + 'formats': formats, + 'description': sd.get('description'), + 'duration': int(sd['duration_in_seconds']), + 'upload_date': upload_date, + 'thumbnail': sd.get('image_full_url'), + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': playlist_id, + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 71abd5013..e9ff8d1af 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -5,7 +5,7 @@ from ..utils import compat_urllib_parse class PornHdIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' _TEST = { u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', u'file': u'1962.flv', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index a589a893b..99f5b19d2 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,5 +1,6 @@ # encoding: utf-8 +import os.path import re import json import hashlib @@ -10,6 +11,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, ExtractorError, + url_basename, ) @@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor): # We will extract some from the video web page instead video_page_url = 'http://' + mobj.group('url') video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') - + + # Warning if video is unavailable + warning = self._html_search_regex( + r'<div class="videoUnModer">(.*?)</div>', video_page, + u'warning message', default=None) + if warning is not None: + self._downloader.report_warning( + u'Video %s may not be available; smotri said: %s ' % + (video_id, warning)) + # Adult content if re.search(u'EroConfirmText">', video_page) is not None: self.report_age_confirmation() @@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor): # Extract the rest of meta data video_title = self._search_meta(u'name', video_page, u'title') if not video_title: - video_title = video_url.rsplit('/', 1)[-1] + video_title = os.path.splitext(url_basename(video_url))[0] video_description = self._search_meta(u'description', video_page) END_TEXT = u' на сайте Smotri.com' - if video_description.endswith(END_TEXT): + if video_description and video_description.endswith(END_TEXT): video_description = video_description[:-len(END_TEXT)] START_TEXT = u'Смотреть онлайн ролик ' - if video_description.startswith(START_TEXT): + if video_description and video_description.startswith(START_TEXT): video_description = video_description[len(START_TEXT):] video_thumbnail = self._search_meta(u'thumbnail', video_page) upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') - upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) - video_upload_date = ( - ( - upload_date_m.group('year') + - upload_date_m.group('month') + - upload_date_m.group('day') + if upload_date_str: + upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) + video_upload_date = ( + ( + upload_date_m.group('year') + + upload_date_m.group('month') + + upload_date_m.group('day') + ) + if upload_date_m else None ) - if upload_date_m else None - ) + else: + video_upload_date = None duration_str = self._search_meta(u'duration', video_page) - duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) - video_duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) + if duration_str: + duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) + video_duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m else None ) - if duration_m else None - ) + else: + video_duration = None video_uploader = self._html_search_regex( u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e22ff9c38..951e977bd 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): (?!sets/)(?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) - |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) + |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = u'soundcloud' @@ -193,7 +193,7 @@ class SoundcloudIE(InfoExtractor): if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id - elif mobj.group('widget'): + elif mobj.group('player'): query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) return self.url_result(query['url'][0], ie='Soundcloud') else: diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 695520524..051a34d5b 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -51,9 +51,10 @@ class SpiegelIE(InfoExtractor): # Blacklist type 6, it's extremely LQ and not available on the same server if n.tag.startswith('type') and n.tag != 'type6' ] - formats.sort(key=lambda f: f['vbr']) duration = float(idoc[0].findall('./duration')[0].text) + self._sort_formats(formats) + info = { 'id': video_id, 'title': video_title, diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index cec65261b..23172143e 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -55,15 +55,21 @@ class ThePlatformIE(InfoExtractor): formats = [] for f in switch.findall(_x('smil:video')): attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) formats.append({ + 'format_id': format_id, 'url': base_url, 'play_path': 'mp4:' + attr['src'], 'ext': 'flv', - 'width': int(attr['width']), - 'height': int(attr['height']), - 'vbr': int(attr['system-bitrate']), + 'width': width, + 'height': height, + 'vbr': vbr, }) - formats.sort(key=lambda f: (f['height'], f['width'], f['vbr'])) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py index 3cf8c853d..b1c854a64 100644 --- a/youtube_dl/extractor/veehd.py +++ b/youtube_dl/extractor/veehd.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -8,16 +10,17 @@ from ..utils import ( clean_html, ) + class VeeHDIE(InfoExtractor): _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)' _TEST = { - u'url': u'http://veehd.com/video/4686958', - u'file': u'4686958.mp4', - u'info_dict': { - u'title': u'Time Lapse View from Space ( ISS)', - u'uploader_id': u'spotted', - u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', + 'url': 'http://veehd.com/video/4686958', + 'file': '4686958.mp4', + 'info_dict': { + 'title': 'Time Lapse View from Space ( ISS)', + 'uploader_id': 'spotted', + 'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7', }, } @@ -25,24 +28,30 @@ class VeeHDIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + # VeeHD seems to send garbage on the first request. + # See https://github.com/rg3/youtube-dl/issues/2102 + self._download_webpage(url, video_id, 'Requesting webpage') webpage = self._download_webpage(url, video_id) - player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', - webpage, u'player path') + player_path = self._search_regex( + r'\$\("#playeriframe"\).attr\({src : "(.+?)"', + webpage, 'player path') player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage(player_url, video_id, - u'Downloading player page') - config_json = self._search_regex(r'value=\'config=({.+?})\'', - player_page, u'config json') + + self._download_webpage(player_url, video_id, 'Requesting player page') + player_page = self._download_webpage( + player_url, video_id, 'Downloading player page') + config_json = self._search_regex( + r'value=\'config=({.+?})\'', player_page, 'config json') config = json.loads(config_json) video_url = compat_urlparse.unquote(config['clip']['url']) title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>', - webpage, u'uploader') + webpage, 'uploader') thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"', - webpage, u'thumbnail') + webpage, 'thumbnail') description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul', - webpage, u'description', flags=re.DOTALL) + webpage, 'description', flags=re.DOTALL) return { '_type': 'video', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 00672c9e5..baa57f343 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -1,22 +1,22 @@ +from __future__ import unicode_literals + import re import json from .common import InfoExtractor -from ..utils import ( - determine_ext, -) + class VeohIE(InfoExtractor): - _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)' + _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)' _TEST = { - u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3', - u'file': u'56314296.mp4', - u'md5': u'620e68e6a3cff80086df3348426c9ca3', - u'info_dict': { - u'title': u'Straight Backs Are Stronger', - u'uploader': u'LUMOback', - u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', + 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', + 'file': '56314296.mp4', + 'md5': '620e68e6a3cff80086df3348426c9ca3', + 'info_dict': { + 'title': 'Straight Backs Are Stronger', + 'uploader': 'LUMOback', + 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', } } @@ -28,20 +28,20 @@ class VeohIE(InfoExtractor): m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage) if m_youtube is not None: youtube_id = m_youtube.group(1) - self.to_screen(u'%s: detected Youtube video.' % video_id) + self.to_screen('%s: detected Youtube video.' % video_id) return self.url_result(youtube_id, 'Youtube') self.report_extraction(video_id) info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info') info = json.loads(info) - video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath') - - return {'id': info['videoId'], - 'title': info['title'], - 'ext': determine_ext(video_url), - 'url': video_url, - 'uploader': info['username'], - 'thumbnail': info.get('highResImage') or info.get('medResImage'), - 'description': info['description'], - 'view_count': info['views'], - } + video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath') + + return { + 'id': info['videoId'], + 'title': info['title'], + 'url': video_url, + 'uploader': info['username'], + 'thumbnail': info.get('highResImage') or info.get('medResImage'), + 'description': info['description'], + 'view_count': info['views'], + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c3623fcbe..ad86d033a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import json import re import itertools @@ -31,54 +33,55 @@ class VimeoIE(InfoExtractor): (?P<id>[0-9]+) /?(?:[?&].*)?(?:[#].*)?$''' _NETRC_MACHINE = 'vimeo' - IE_NAME = u'vimeo' + IE_NAME = 'vimeo' _TESTS = [ { - u'url': u'http://vimeo.com/56015672#at=0', - u'file': u'56015672.mp4', - u'md5': u'8879b6cc097e987f02484baf890129e5', - u'info_dict': { - u"upload_date": u"20121220", - u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - u"uploader_id": u"user7108434", - u"uploader": u"Filippo Valsorda", - u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + 'url': 'http://vimeo.com/56015672#at=0', + 'file': '56015672.mp4', + 'md5': '8879b6cc097e987f02484baf890129e5', + 'info_dict': { + "upload_date": "20121220", + "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + "uploader_id": "user7108434", + "uploader": "Filippo Valsorda", + "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", }, }, { - u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', - u'file': u'68093876.mp4', - u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82', - u'note': u'Vimeo Pro video (#1197)', - u'info_dict': { - u'uploader_id': u'openstreetmapus', - u'uploader': u'OpenStreetMap US', - u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', + 'file': '68093876.mp4', + 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', + 'note': 'Vimeo Pro video (#1197)', + 'info_dict': { + 'uploader_id': 'openstreetmapus', + 'uploader': 'OpenStreetMap US', + 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', }, }, { - u'url': u'http://player.vimeo.com/video/54469442', - u'file': u'54469442.mp4', - u'md5': u'619b811a4417aa4abe78dc653becf511', - u'note': u'Videos that embed the url in the player page', - u'info_dict': { - u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', - u'uploader': u'The BLN & Business of Software', + 'url': 'http://player.vimeo.com/video/54469442', + 'file': '54469442.mp4', + 'md5': '619b811a4417aa4abe78dc653becf511', + 'note': 'Videos that embed the url in the player page', + 'info_dict': { + 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', + 'uploader': 'The BLN & Business of Software', + 'uploader_id': 'theblnbusinessofsoftware', }, }, { - u'url': u'http://vimeo.com/68375962', - u'file': u'68375962.mp4', - u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7', - u'note': u'Video protected with password', - u'info_dict': { - u'title': u'youtube-dl password protected test video', - u'upload_date': u'20130614', - u'uploader_id': u'user18948128', - u'uploader': u'Jaime Marquínez Ferrándiz', + 'url': 'http://vimeo.com/68375962', + 'file': '68375962.mp4', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'note': 'Video protected with password', + 'info_dict': { + 'title': 'youtube-dl password protected test video', + 'upload_date': '20130614', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', }, - u'params': { - u'videopassword': u'youtube-dl', + 'params': { + 'videopassword': 'youtube-dl', }, }, ] @@ -90,7 +93,7 @@ class VimeoIE(InfoExtractor): self.report_login() login_url = 'https://vimeo.com/log_in' webpage = self._download_webpage(login_url, None, False) - token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) + token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') data = compat_urllib_parse.urlencode({'email': username, 'password': password, 'action': 'login', @@ -100,13 +103,13 @@ class VimeoIE(InfoExtractor): login_request = compat_urllib_request.Request(login_url, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Cookie', 'xsrft=%s' % token) - self._download_webpage(login_request, None, False, u'Wrong login info') + self._download_webpage(login_request, None, False, 'Wrong login info') def _verify_video_password(self, url, video_id, webpage): password = self._downloader.params.get('videopassword', None) if password is None: - raise ExtractorError(u'This video is protected by a password, use the --video-password option') - token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) + raise ExtractorError('This video is protected by a password, use the --video-password option') + token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') data = compat_urllib_parse.urlencode({'password': password, 'token': token}) # I didn't manage to use the password with https @@ -118,8 +121,8 @@ class VimeoIE(InfoExtractor): password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') password_request.add_header('Cookie', 'xsrft=%s' % token) self._download_webpage(password_request, video_id, - u'Verifying the password', - u'Wrong password') + 'Verifying the password', + 'Wrong password') def _real_initialize(self): self._login() @@ -134,7 +137,7 @@ class VimeoIE(InfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group('id') if mobj.group('pro') or mobj.group('player'): @@ -155,7 +158,7 @@ class VimeoIE(InfoExtractor): try: try: config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, u'config URL') + r' data-config-url="(.+?)"', webpage, 'config URL') config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: @@ -166,19 +169,23 @@ class VimeoIE(InfoExtractor): config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config = self._search_regex(config_re, webpage, u'info section', + config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') + raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') if re.search('<form[^>]+?id="pw_form"', webpage) is not None: self._verify_video_password(url, video_id, webpage) return self._real_extract(url) else: - raise ExtractorError(u'Unable to extract info section', + raise ExtractorError('Unable to extract info section', cause=e) + else: + if config.get('view') == 4: + self._verify_video_password(url, video_id, webpage) + return self._real_extract(url) # Extract title video_title = config["video"]["title"] @@ -212,9 +219,9 @@ class VimeoIE(InfoExtractor): video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count')) + view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) + like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) + comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) except RegexNotFoundError: # This info is only available in vimeo.com/{id} urls view_count = None @@ -255,7 +262,7 @@ class VimeoIE(InfoExtractor): for key in ('other', 'sd', 'hd'): formats += files[key] if len(formats) == 0: - raise ExtractorError(u'No known codec found') + raise ExtractorError('No known codec found') return { 'id': video_id, @@ -274,7 +281,7 @@ class VimeoIE(InfoExtractor): class VimeoChannelIE(InfoExtractor): - IE_NAME = u'vimeo:channel' + IE_NAME = 'vimeo:channel' _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)' _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' @@ -283,14 +290,14 @@ class VimeoChannelIE(InfoExtractor): return '%s/videos/page:%d/' % (base_url, pagenum) def _extract_list_title(self, webpage): - return self._html_search_regex(self._TITLE_RE, webpage, u'list title') + return self._html_search_regex(self._TITLE_RE, webpage, 'list title') def _extract_videos(self, list_id, base_url): video_ids = [] for pagenum in itertools.count(1): webpage = self._download_webpage( self._page_url(base_url, pagenum) ,list_id, - u'Downloading page %s' % pagenum) + 'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -310,8 +317,8 @@ class VimeoChannelIE(InfoExtractor): class VimeoUserIE(VimeoChannelIE): - IE_NAME = u'vimeo:user' - _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)' + IE_NAME = 'vimeo:user' + _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)(?:[#?]|$)' _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' @classmethod @@ -327,7 +334,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): - IE_NAME = u'vimeo:album' + IE_NAME = 'vimeo:album' _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' @@ -336,12 +343,12 @@ class VimeoAlbumIE(VimeoChannelIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - album_id = mobj.group('id') + album_id = mobj.group('id') return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id) class VimeoGroupsIE(VimeoAlbumIE): - IE_NAME = u'vimeo:group' + IE_NAME = 'vimeo:group' _VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)' def _extract_list_title(self, webpage): @@ -351,3 +358,24 @@ class VimeoGroupsIE(VimeoAlbumIE): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name) + + +class VimeoReviewIE(InfoExtractor): + IE_NAME = 'vimeo:review' + IE_DESC = 'Review pages on vimeo' + _VALID_URL = r'(?:https?://)?vimeo.\com/[^/]+/review/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', + 'file': '75524534.mp4', + 'md5': 'c507a72f780cacc12b2248bb4006d253', + 'info_dict': { + 'title': "DICK HARDWICK 'Comedian'", + 'uploader': 'Richard Hardwick', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + player_url = 'https://player.vimeo.com/player/' + video_id + return self.url_result(player_url, 'Vimeo', video_id) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index e1748c261..bc31c2e64 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -44,8 +44,10 @@ class WistiaIE(InfoExtractor): 'height': a['height'], 'filesize': a['size'], 'ext': a['ext'], + 'preference': 1 if atype == 'original' else None, }) - formats.sort(key=lambda a: a['filesize']) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 5c9c361b9..e17a39782 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -6,8 +6,8 @@ from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_urllib_parse, compat_urlparse, - determine_ext, clean_html, + int_or_none, ) @@ -68,9 +68,9 @@ class YahooIE(InfoExtractor): formats = [] for s in info['streams']: format_info = { - 'width': s.get('width'), - 'height': s.get('height'), - 'bitrate': s.get('bitrate'), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'tbr': int_or_none(s.get('bitrate')), } host = s['host'] @@ -84,10 +84,10 @@ class YahooIE(InfoExtractor): else: format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url - format_info['ext'] = determine_ext(format_url) formats.append(format_info) - formats = sorted(formats, key=lambda f:(f['height'], f['width'])) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index bd0f2cae0..77ad423c4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,5 +1,4 @@ import json -import os import re import sys @@ -16,6 +15,7 @@ from ..aes import ( aes_decrypt_text ) + class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { @@ -23,9 +23,9 @@ class YouPornIE(InfoExtractor): u'file': u'505835.mp4', u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { - u"upload_date": u"20101221", - u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", - u"uploader": u"Ask Dan And Jennifer", + u"upload_date": u"20101221", + u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + u"uploader": u"Ask Dan And Jennifer", u"title": u"Sex Ed: Is It Safe To Masturbate Daily?", u"age_limit": 18, } @@ -71,38 +71,36 @@ class YouPornIE(InfoExtractor): link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') links.append(link) - if not links: - raise ExtractorError(u'ERROR: no known formats available for video') - formats = [] for link in links: - # A link looks like this: # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 # A path looks like this: # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 video_url = unescapeHTML(link) path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[4].split('_')[:2] + format_parts = path.split('/')[4].split('_')[:2] - # size = format[0] - # bitrate = format[1] - format = "-".join(format) - # title = u'%s-%s-%s' % (video_title, size, bitrate) + dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0] + + resolution = format_parts[0] + height = int(resolution[:-len('p')]) + bitrate = int(format_parts[1][:-len('k')]) + format = u'-'.join(format_parts) + u'-' + dn formats.append({ 'url': video_url, - 'ext': extension, 'format': format, 'format_id': format, + 'height': height, + 'tbr': bitrate, + 'resolution': resolution, }) - # Sort and remove doubles - formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) - for i in range(len(formats)-1,0,-1): - if formats[i]['format_id'] == formats[i-1]['format_id']: - del formats[i] + self._sort_formats(formats) + + if not formats: + raise ExtractorError(u'ERROR: no known formats available for video') return { 'id': video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a68576547..9424d5e26 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -150,168 +150,69 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', - # Apple HTTP Live Streaming - '96', '95', '94', '93', '92', '132', '151', - # 3D - '85', '84', '102', '83', '101', '82', '100', - # Dash video - '138', '137', '248', '136', '247', '135', '246', - '245', '244', '134', '243', '133', '242', '160', - # Dash audio - '141', '172', '140', '171', '139', - ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', - # Apple HTTP Live Streaming - '96', '95', '94', '93', '92', '132', '151', - # 3D - '85', '102', '84', '101', '83', '100', '82', - # Dash video - '138', '248', '137', '247', '136', '246', '245', - '244', '135', '243', '134', '242', '133', '160', - # Dash audio - '172', '141', '171', '140', '139', - ] - _video_formats_map = { - 'flv': ['35', '34', '6', '5'], - '3gp': ['36', '17', '13'], - 'mp4': ['38', '37', '22', '18'], - 'webm': ['46', '45', '44', '43'], - } - _video_extensions = { - '13': '3gp', - '17': '3gp', - '18': 'mp4', - '22': 'mp4', - '36': '3gp', - '37': 'mp4', - '38': 'mp4', - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240}, + '6': {'ext': 'flv', 'width': 450, 'height': 270}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720}, + '34': {'ext': 'flv', 'width': 640, 'height': 360}, + '35': {'ext': 'flv', 'width': 854, 'height': 480}, + '36': {'ext': '3gp', 'width': 320, 'height': 240}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072}, + '43': {'ext': 'webm', 'width': 640, 'height': 360}, + '44': {'ext': 'webm', 'width': 854, 'height': 480}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, + # 3d videos - '82': 'mp4', - '83': 'mp4', - '84': 'mp4', - '85': 'mp4', - '100': 'webm', - '101': 'webm', - '102': 'webm', + '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, # Apple HTTP Live Streaming - '92': 'mp4', - '93': 'mp4', - '94': 'mp4', - '95': 'mp4', - '96': 'mp4', - '132': 'mp4', - '151': 'mp4', - - # Dash mp4 - '133': 'mp4', - '134': 'mp4', - '135': 'mp4', - '136': 'mp4', - '137': 'mp4', - '138': 'mp4', - '160': 'mp4', + '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40}, + '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40}, + '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40}, + '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40}, + '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, + '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40}, + '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, # Dash mp4 audio - '139': 'm4a', - '140': 'm4a', - '141': 'm4a', + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm - '171': 'webm', - '172': 'webm', - '242': 'webm', - '243': 'webm', - '244': 'webm', - '245': 'webm', - '246': 'webm', - '247': 'webm', - '248': 'webm', - } - _video_dimensions = { - '5': '400x240', - '6': '???', - '13': '???', - '17': '176x144', - '18': '640x360', - '22': '1280x720', - '34': '640x360', - '35': '854x480', - '36': '320x240', - '37': '1920x1080', - '38': '4096x3072', - '43': '640x360', - '44': '854x480', - '45': '1280x720', - '46': '1920x1080', - '82': '360p', - '83': '480p', - '84': '720p', - '85': '1080p', - '92': '240p', - '93': '360p', - '94': '480p', - '95': '720p', - '96': '1080p', - '100': '360p', - '101': '480p', - '102': '720p', - '132': '240p', - '151': '72p', - '133': '240p', - '134': '360p', - '135': '480p', - '136': '720p', - '137': '1080p', - '138': '>1080p', - '139': '48k', - '140': '128k', - '141': '256k', - '160': '192p', - '171': '128k', - '172': '256k', - '242': '240p', - '243': '360p', - '244': '480p', - '245': '480p', - '246': '480p', - '247': '720p', - '248': '1080p', - } - _special_itags = { - '82': '3D', - '83': '3D', - '84': '3D', - '85': '3D', - '100': '3D', - '101': '3D', - '102': '3D', - '133': 'DASH Video', - '134': 'DASH Video', - '135': 'DASH Video', - '136': 'DASH Video', - '137': 'DASH Video', - '138': 'DASH Video', - '139': 'DASH Audio', - '140': 'DASH Audio', - '141': 'DASH Audio', - '160': 'DASH Video', - '171': 'DASH Audio', - '172': 'DASH Audio', - '242': 'DASH Video', - '243': 'DASH Video', - '244': 'DASH Video', - '245': 'DASH Video', - '246': 'DASH Video', - '247': 'DASH Video', - '248': 'DASH Video', + '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40}, + + # Dash webm audio + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, + '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50}, } IE_NAME = u'youtube' @@ -1153,13 +1054,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), - self._video_dimensions.get(x, '???'), - ' ('+self._special_itags[x]+')' if x in self._special_itags else '')) - def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: @@ -1172,48 +1066,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): Transform a dictionary in the format {itag:url} to a list of (itag, url) with the requested formats. """ - req_format = self._downloader.params.get('format', None) - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] + existing_formats = [x for x in self._formats if x in url_map] if len(existing_formats) == 0: raise ExtractorError(u'no known formats available for video') - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality - # available in the specified format. For example, - # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. - # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if rf in self._video_formats_map: - for srf in self._video_formats_map[rf]: - if srf in url_map: - video_url_list = [(srf, url_map[srf])] - break - else: - continue - break - if video_url_list is None: - raise ExtractorError(u'requested format not available') + video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats + video_url_list.reverse() # order worst to best return video_url_list def _extract_from_m3u8(self, manifest_url, video_id): @@ -1462,50 +1319,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url video_url_list = self._get_video_url_list(url_map) - if not video_url_list: - return elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) video_url_list = self._get_video_url_list(url_map) - if not video_url_list: - return - else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - results = [] + formats = [] for itag, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(itag, 'flv') - - video_format = '{0} - {1}{2}'.format(itag if itag else video_extension, - self._video_dimensions.get(itag, '???'), - ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '') - - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, + dct = { 'format_id': itag, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - }) - return results + 'url': video_real_url, + 'player_url': player_url, + } + dct.update(self._formats[itag]) + formats.append(dct) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'upload_date': upload_date, + 'title': video_title, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'subtitles': video_subtitles, + 'duration': video_duration, + 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations, + 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 35ece354a..829f002cf 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,10 +1,10 @@ # coding: utf-8 -import operator import re from .common import InfoExtractor from ..utils import ( + int_or_none, unified_strdate, ) @@ -67,29 +67,13 @@ class ZDFIE(InfoExtractor): ''', format_id) ext = format_m.group('container') - is_supported = ext != 'f4f' - - PROTO_ORDER = ['http', 'rtmp', 'rtsp'] - try: - proto_pref = -PROTO_ORDER.index(format_m.group('proto')) - except ValueError: - proto_pref = -999 + proto = format_m.group('proto').lower() quality = fnode.find('./quality').text - QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] - try: - quality_pref = -QUALITY_ORDER.index(quality) - except ValueError: - quality_pref = -999 - abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 - pref = (is_available, is_supported, - proto_pref, quality_pref, vbr, abr) format_note = u'' - if not is_supported: - format_note += u'(unsupported)' if not format_note: format_note = None @@ -101,18 +85,20 @@ class ZDFIE(InfoExtractor): 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': int(fnode.find('./width').text), - 'height': int(fnode.find('./height').text), - 'filesize': int(fnode.find('./filesize').text), + 'width': int_or_none(fnode.find('./width').text), + 'height': int_or_none(fnode.find('./height').text), + 'filesize': int_or_none(fnode.find('./filesize').text), 'format_note': format_note, - '_pref': pref, + 'protocol': proto, '_available': is_available, } format_nodes = doc.findall('.//formitaeten/formitaet') - formats = sorted(filter(lambda f: f['_available'], - map(xml_to_format, format_nodes)), - key=operator.itemgetter('_pref')) + formats = list(filter( + lambda f: f['_available'], + map(xml_to_format, format_nodes))) + + self._sort_formats(formats) return { 'id': video_id, |