diff options
Diffstat (limited to 'youtube_dl/extractor')
23 files changed, 391 insertions, 137 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2caa078b5..21d564dba 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -28,6 +28,7 @@ from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .clipsyndicate import ClipsyndicateIE +from .cmt import CMTIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE @@ -79,7 +80,10 @@ from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE -from .imdb import ImdbIE +from .imdb import ( + ImdbIE, + ImdbListIE +) from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE @@ -91,6 +95,7 @@ from .ivi import ( from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE +from .jpopsukitv import JpopsukiIE from .kankan import KankanIE from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE @@ -101,6 +106,7 @@ from .lynda import ( LyndaIE, LyndaCourseIE ) +from .macgamestore import MacGameStoreIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3a32c14c5..15aee2786 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,14 +10,14 @@ from ..utils import ( class BandcampIE(InfoExtractor): - IE_NAME = u'Bandcamp' _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', u'file': u'1812978515.mp3', - u'md5': u'cdeb30cdae1921719a3cbcab696ef53c', + u'md5': u'c557841d5e50261777a6585648adf439', u'info_dict': { - u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" + u"title": u"youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + u"duration": 10, }, u'skip': u'There is a limit of 200 free downloads / month for the test song' }] @@ -30,29 +30,42 @@ class BandcampIE(InfoExtractor): m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) - if m_trackinfo: - json_code = m_trackinfo.group(1) - data = json.loads(json_code) + if m_trackinfo: + json_code = m_trackinfo.group(1) + data = json.loads(json_code) + d = data[0] + + duration = int(round(d['duration'])) + formats = [] + for format_id, format_url in d['file'].items(): + ext, _, abr_str = format_id.partition('-') + + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': format_id.partition('-')[0], + 'vcodec': 'none', + 'acodec': format_id.partition('-')[0], + 'abr': int(format_id.partition('-')[2]), + }) + + self._sort_formats(formats) - for d in data: - formats = [{ - 'format_id': 'format_id', - 'url': format_url, - 'ext': format_id.partition('-')[0] - } for format_id, format_url in sorted(d['file'].items())] return { 'id': compat_str(d['id']), 'title': d['title'], 'formats': formats, + 'duration': duration, } - else: - raise ExtractorError(u'No free songs found') + else: + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) - id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', - webpage, re.MULTILINE|re.DOTALL).group('id') + video_id = re.search( + r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', + webpage, re.MULTILINE | re.DOTALL).group('id') - download_webpage = self._download_webpage(download_link, id, + download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') # We get the dictionary of the track from some javascrip code info = re.search(r'items: (.*?),$', @@ -66,21 +79,21 @@ class BandcampIE(InfoExtractor): m_url = re.match(re_url, initial_url) #We build the url we will use to get the final track url # This url is build in Bandcamp in the script download_bunde_*.js - request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) + request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts')) final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') # If we could correctly generate the .rand field the url would be #in the "download_url" key final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) - track_info = {'id':id, - 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, - 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] - } - - return [track_info] + return { + 'id': video_id, + 'title': info[u'title'], + 'ext': 'mp3', + 'vcodec': 'none', + 'url': final_url, + 'thumbnail': info[u'thumb_url'], + 'uploader': info[u'artist'], + } class BandcampAlbumIE(InfoExtractor): @@ -117,7 +130,7 @@ class BandcampAlbumIE(InfoExtractor): webpage = self._download_webpage(url, title) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: - raise ExtractorError(u'The page doesn\'t contain any track') + raise ExtractorError(u'The page doesn\'t contain any tracks') entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 144ce64cc..0229840a3 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -61,9 +61,10 @@ class BlinkxIE(InfoExtractor): elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') + tbr = (int(m['vbr']) + int(m['abr'])) // 1000 format_id = (u'%s-%sk-%s' % (vcodec, - (int(m['vbr']) + int(m['abr'])) // 1000, + tbr, m['w'])) formats.append({ 'format_id': format_id, @@ -72,10 +73,12 @@ class BlinkxIE(InfoExtractor): 'acodec': acodec, 'abr': int(m['abr']) // 1000, 'vbr': int(m['vbr']) // 1000, + 'tbr': tbr, 'width': int(m['w']), 'height': int(m['h']), }) - formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr'])) + + self._sort_formats(formats) return { 'id': display_id, diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index ae70ea229..574881b70 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -76,14 +76,18 @@ class Channel9IE(InfoExtractor): </div>)? # File size part may be missing ''' # Extract known formats - formats = [{'url': x.group('url'), - 'format_id': x.group('quality'), - 'format_note': x.group('note'), - 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] - # Sort according to known formats list - formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + formats = [{ + 'url': x.group('url'), + 'format_id': x.group('quality'), + 'format_note': x.group('note'), + 'format': u'%s (%s)' % (x.group('quality'), x.group('note')), + 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate + 'preference': self._known_formats.index(x.group('quality')), + 'vcodec': 'none' if x.group('note') == 'Audio only' else None, + } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + + self._sort_formats(formats) + return formats def _extract_title(self, html): diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py new file mode 100644 index 000000000..88e0e9aba --- /dev/null +++ b/youtube_dl/extractor/cmt.py @@ -0,0 +1,19 @@ +from .mtv import MTVIE + +class CMTIE(MTVIE): + IE_NAME = u'cmt.com' + _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml' + _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' + + _TESTS = [ + { + u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2', + u'info_dict': { + u'id': u'989124', + u'ext': u'mp4', + u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + u'description': u'Blame It All On My Roots', + }, + }, + ] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a034bb2fb..ecac5e0e9 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + int_or_none, + parse_duration, +) class CNNIE(InfoExtractor): @@ -15,6 +18,8 @@ class CNNIE(InfoExtractor): u'info_dict': { u'title': u'Nadal wins 8th French Open title', u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + u'duration': 135, + u'upload_date': u'20130609', }, }, { @@ -35,22 +40,58 @@ class CNNIE(InfoExtractor): info = self._download_xml(info_url, page_title) formats = [] + rex = re.compile(r'''(?x) + (?P<width>[0-9]+)x(?P<height>[0-9]+) + (?:_(?P<bitrate>[0-9]+)k)? + ''') for f in info.findall('files/file'): - mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) - if mf is not None: - formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) - formats = sorted(formats) - (_,_,_, video_path) = formats[-1] - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) + fdct = { + 'format_id': f.attrib['bitrate'], + 'url': video_url, + } + + mf = rex.match(f.attrib['bitrate']) + if mf: + fdct['width'] = int(mf.group('width')) + fdct['height'] = int(mf.group('height')) + fdct['tbr'] = int_or_none(mf.group('bitrate')) + else: + mf = rex.search(f.text) + if mf: + fdct['width'] = int(mf.group('width')) + fdct['height'] = int(mf.group('height')) + fdct['tbr'] = int_or_none(mf.group('bitrate')) + else: + mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) + if mi: + if mi.group(1) == 'audio': + fdct['vcodec'] = 'none' + fdct['ext'] = 'm4a' + else: + fdct['tbr'] = int(mi.group(1)) + + formats.append(fdct) + + self._sort_formats(formats) thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] - return {'id': info.attrib['id'], - 'title': info.find('headline').text, - 'url': video_url, - 'ext': determine_ext(video_url), - 'thumbnail': thumbnails[-1][1], - 'thumbnails': thumbs_dict, - 'description': info.find('description').text, - } + metas_el = info.find('metas') + upload_date = ( + metas_el.attrib.get('version') if metas_el is not None else None) + + duration_el = info.find('length') + duration = parse_duration(duration_el.text) + + return { + 'id': info.attrib['id'], + 'title': info.find('headline').text, + 'formats': formats, + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + 'duration': duration, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index a54ce3ee7..27bd8256e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -12,7 +12,9 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ + (video-clips|episodes|cc-studios|video-collections) + /(?P<title>.*)''' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' _TEST = { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6fa60622e..f498bcf6f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -51,7 +51,8 @@ class InfoExtractor(object): Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format - ("mp4_h264_opus" or "19") + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. * format_note Additional info about the format ("3D" or "DASH video") * width Width of the video, if known @@ -68,7 +69,8 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp" or so. * preference Order number of this format. If this field is - present, the formats get sorted by this field. + present and not None, the formats get sorted + by this field. -1 for default (order by other properties), -2 or smaller for less than default. url: Final video URL. @@ -376,7 +378,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' - property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 416e25156..0b11d1f10 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -10,11 +10,11 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", - u'file': u'36983.webm', - u'md5': u'57c97d0469d71cf874f6815aa2b7c944', + u'file': u'36983.mp4', + u'md5': u'9dcfe344732808dbfcc901537973c922', u'info_dict': { u"title": u"Kaffeeland Schweiz", u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...", diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a14c98f9..377ae91c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -162,6 +162,8 @@ class GenericIE(InfoExtractor): return self.url_result('http://' + url) video_id = os.path.splitext(url.split('/')[-1])[0] + self.to_screen(u'%s: Requesting header' % video_id) + try: response = self._send_head(url) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index e5332cce8..16926b4d3 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -55,3 +55,32 @@ class ImdbIE(InfoExtractor): 'description': descr, 'thumbnail': format_info['slate'], } + +class ImdbListIE(InfoExtractor): + IE_NAME = u'imdb:list' + IE_DESC = u'Internet Movie Database lists' + _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + list_id = mobj.group('id') + + # RSS XML is sometimes malformed + rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, u'Downloading list RSS') + list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, u'list title') + + # Export is independent of actual author_id, but returns 404 if no author_id is provided. + # However, passing dummy author_id seems to be enough. + csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id, + list_id, u'Downloading list CSV') + + entries = [] + for item in csv.split('\n')[1:]: + cols = item.split(',') + if len(cols) < 2: + continue + item_id = cols[1][1:-1] + if item_id.startswith('vi'): + entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb')) + + return self.playlist_result(entries, list_id, list_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 16a6f73c8..4ddda2f1b 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -5,7 +5,6 @@ from ..utils import ( compat_urlparse, compat_urllib_parse, xpath_with_ns, - determine_ext, ) @@ -63,13 +62,17 @@ class InternetVideoArchiveIE(InfoExtractor): for content in item.findall(_bp('media:group/media:content')): attr = content.attrib f_url = attr['url'] + width = int(attr['width']) + bitrate = int(attr['bitrate']) + format_id = '%d-%dk' % (width, bitrate) formats.append({ + 'format_id': format_id, 'url': f_url, - 'ext': determine_ext(f_url), - 'width': int(attr['width']), - 'bitrate': int(attr['bitrate']), + 'width': width, + 'tbr': bitrate, }) - formats = sorted(formats, key=lambda f: f['bitrate']) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 4bdf55f93..98d1d272a 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -84,14 +84,16 @@ class IviIE(InfoExtractor): result = video_json[u'result'] - formats = [{'url': x[u'url'], - 'format_id': x[u'content_format'] - } for x in result[u'files'] if x[u'content_format'] in self._known_formats] - formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) - - if len(formats) == 0: - self._downloader.report_warning(u'No media links available for %s' % video_id) - return + formats = [{ + 'url': x[u'url'], + 'format_id': x[u'content_format'], + 'preference': self._known_formats.index(x[u'content_format']), + } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + + self._sort_formats(formats) + + if not formats: + raise ExtractorError(u'No media links available for %s' % video_id) duration = result[u'duration'] compilation = result[u'compilation'] diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py new file mode 100644 index 000000000..aad782578 --- /dev/null +++ b/youtube_dl/extractor/jpopsukitv.py @@ -0,0 +1,73 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class JpopsukiIE(InfoExtractor): + IE_NAME = 'jpopsuki.tv' + _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' + + _TEST = { + 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', + 'md5': '88018c0c1a9b1387940e90ec9e7e198e', + 'file': '00be659d23b0b40508169cdee4545771.mp4', + 'info_dict': { + 'id': '00be659d23b0b40508169cdee4545771', + 'title': 'ayumi hamasaki - evolution', + 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', + 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', + 'uploader': 'plama_chan', + 'uploader_id': '404', + 'upload_date': '20121101' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = 'http://www.jpopsuki.tv' + self._html_search_regex( + r'<source src="(.*?)" type', webpage, 'video url') + + video_title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<li>from: <a href="/user/view/user/(.*?)/uid/', + webpage, 'video uploader', fatal=False) + uploader_id = self._html_search_regex( + r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)', + webpage, 'video uploader_id', fatal=False) + upload_date = self._html_search_regex( + r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', + fatal=False) + if upload_date is not None: + upload_date = unified_strdate(upload_date) + view_count_str = self._html_search_regex( + r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count', + fatal=False) + comment_count_str = self._html_search_regex( + r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count', + fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': int_or_none(view_count_str), + 'comment_count': int_or_none(comment_count_str), + } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index dd59aa3e6..592ed747a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -6,17 +8,17 @@ from ..utils import ExtractorError class LyndaIE(InfoExtractor): - IE_NAME = u'lynda' - IE_DESC = u'lynda.com videos' + IE_NAME = 'lynda' + IE_DESC = 'lynda.com videos' _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' _TEST = { - u'url': u'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - u'file': u'114408.mp4', - u'md5': u'ecfc6862da89489161fb9cd5f5a6fac1', + 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'file': '114408.mp4', + 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', u"info_dict": { - u'title': u'Using the exercise files', - u'duration': 68 + 'title': 'Using the exercise files', + 'duration': 68 } } @@ -25,26 +27,26 @@ class LyndaIE(InfoExtractor): video_id = mobj.group(1) page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, - video_id, u'Downloading video JSON') + video_id, 'Downloading video JSON') video_json = json.loads(page) - if u'Status' in video_json and video_json[u'Status'] == u'NotFound': - raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + if 'Status' in video_json and video_json['Status'] == 'NotFound': + raise ExtractorError('Video %s does not exist' % video_id, expected=True) - if video_json[u'HasAccess'] is False: - raise ExtractorError(u'Video %s is only available for members' % video_id, expected=True) + if video_json['HasAccess'] is False: + raise ExtractorError('Video %s is only available for members' % video_id, expected=True) - video_id = video_json[u'ID'] - duration = video_json[u'DurationInSeconds'] - title = video_json[u'Title'] + video_id = video_json['ID'] + duration = video_json['DurationInSeconds'] + title = video_json['Title'] - formats = [{'url': fmt[u'Url'], - 'ext': fmt[u'Extension'], - 'width': fmt[u'Width'], - 'height': fmt[u'Height'], - 'filesize': fmt[u'FileSize'], - 'format_id': fmt[u'Resolution'] - } for fmt in video_json[u'Formats']] + formats = [{'url': fmt['Url'], + 'ext': fmt['Extension'], + 'width': fmt['Width'], + 'height': fmt['Height'], + 'filesize': fmt['FileSize'], + 'format_id': fmt['Resolution'] + } for fmt in video_json['Formats']] self._sort_formats(formats) @@ -57,8 +59,8 @@ class LyndaIE(InfoExtractor): class LyndaCourseIE(InfoExtractor): - IE_NAME = u'lynda:course' - IE_DESC = u'lynda.com online courses' + IE_NAME = 'lynda:course' + IE_DESC = 'lynda.com online courses' # Course link equals to welcome/introduction video link of same course # We will recognize it as course link @@ -70,27 +72,31 @@ class LyndaCourseIE(InfoExtractor): course_id = mobj.group('courseid') page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, - course_id, u'Downloading course JSON') + course_id, 'Downloading course JSON') course_json = json.loads(page) - if u'Status' in course_json and course_json[u'Status'] == u'NotFound': - raise ExtractorError(u'Course %s does not exist' % course_id, expected=True) + if 'Status' in course_json and course_json['Status'] == 'NotFound': + raise ExtractorError('Course %s does not exist' % course_id, expected=True) unaccessible_videos = 0 videos = [] - for chapter in course_json[u'Chapters']: - for video in chapter[u'Videos']: - if video[u'HasAccess'] is not True: + for chapter in course_json['Chapters']: + for video in chapter['Videos']: + if video['HasAccess'] is not True: unaccessible_videos += 1 continue - videos.append(video[u'ID']) + videos.append(video['ID']) if unaccessible_videos > 0: - self._downloader.report_warning(u'%s videos are only available for members and will not be downloaded' % unaccessible_videos) + self._downloader.report_warning('%s videos are only available for members and will not be downloaded' % unaccessible_videos) - entries = [self.url_result('http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), 'Lynda') for video_id in videos] + entries = [ + self.url_result('http://www.lynda.com/%s/%s-4.html' % + (course_path, video_id), + 'Lynda') + for video_id in videos] - course_title = course_json[u'Title'] + course_title = course_json['Title'] - return self.playlist_result(entries, course_id, course_title)
\ No newline at end of file + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py new file mode 100644 index 000000000..b818cf50c --- /dev/null +++ b/youtube_dl/extractor/macgamestore.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class MacGameStoreIE(InfoExtractor): + IE_NAME = 'macgamestore' + IE_DESC = 'MacGameStore trailers' + _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', + 'file': '2450.m4v', + 'md5': '8649b8ea684b6666b4c5be736ecddc61', + 'info_dict': { + 'title': 'Crow', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id, 'Downloading trailer page') + + if re.search(r'>Missing Media<', webpage) is not None: + raise ExtractorError('Trailer %s does not exist' % video_id, expected=True) + + video_title = self._html_search_regex( + r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title') + + video_url = self._html_search_regex( + r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>', + webpage, 'video URL') + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 125d81551..7c54ea0f4 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -53,7 +53,7 @@ class MixcloudIE(InfoExtractor): info = json.loads(json_data) preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') - song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') + song_url = preview_url.replace('/previews/', '/c/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) final_song_url = self._get_url(template_url) if final_song_url is None: diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed11f521a..f1cf41e2d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -129,7 +129,7 @@ class MTVIE(MTVServicesInfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - uri = mobj.group('mgid') + uri = mobj.groupdict().get('mgid') if uri is None: webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e22ff9c38..951e977bd 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): (?!sets/)(?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) - |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) + |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = u'soundcloud' @@ -193,7 +193,7 @@ class SoundcloudIE(InfoExtractor): if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id - elif mobj.group('widget'): + elif mobj.group('player'): query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) return self.url_result(query['url'][0], ie='Soundcloud') else: diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index cec65261b..23172143e 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -55,15 +55,21 @@ class ThePlatformIE(InfoExtractor): formats = [] for f in switch.findall(_x('smil:video')): attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) formats.append({ + 'format_id': format_id, 'url': base_url, 'play_path': 'mp4:' + attr['src'], 'ext': 'flv', - 'width': int(attr['width']), - 'height': int(attr['height']), - 'vbr': int(attr['system-bitrate']), + 'width': width, + 'height': height, + 'vbr': vbr, }) - formats.sort(key=lambda f: (f['height'], f['width'], f['vbr'])) + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 584550455..bc31c2e64 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -44,6 +44,7 @@ class WistiaIE(InfoExtractor): 'height': a['height'], 'filesize': a['size'], 'ext': a['ext'], + 'preference': 1 if atype == 'original' else None, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index bd0f2cae0..77ad423c4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,5 +1,4 @@ import json -import os import re import sys @@ -16,6 +15,7 @@ from ..aes import ( aes_decrypt_text ) + class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { @@ -23,9 +23,9 @@ class YouPornIE(InfoExtractor): u'file': u'505835.mp4', u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { - u"upload_date": u"20101221", - u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", - u"uploader": u"Ask Dan And Jennifer", + u"upload_date": u"20101221", + u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + u"uploader": u"Ask Dan And Jennifer", u"title": u"Sex Ed: Is It Safe To Masturbate Daily?", u"age_limit": 18, } @@ -71,38 +71,36 @@ class YouPornIE(InfoExtractor): link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') links.append(link) - if not links: - raise ExtractorError(u'ERROR: no known formats available for video') - formats = [] for link in links: - # A link looks like this: # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 # A path looks like this: # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 video_url = unescapeHTML(link) path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[4].split('_')[:2] + format_parts = path.split('/')[4].split('_')[:2] - # size = format[0] - # bitrate = format[1] - format = "-".join(format) - # title = u'%s-%s-%s' % (video_title, size, bitrate) + dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0] + + resolution = format_parts[0] + height = int(resolution[:-len('p')]) + bitrate = int(format_parts[1][:-len('k')]) + format = u'-'.join(format_parts) + u'-' + dn formats.append({ 'url': video_url, - 'ext': extension, 'format': format, 'format_id': format, + 'height': height, + 'tbr': bitrate, + 'resolution': resolution, }) - # Sort and remove doubles - formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) - for i in range(len(formats)-1,0,-1): - if formats[i]['format_id'] == formats[i-1]['format_id']: - del formats[i] + self._sort_formats(formats) + + if not formats: + raise ExtractorError(u'ERROR: no known formats available for video') return { 'id': video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b0e29c2a8..9424d5e26 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -194,6 +194,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40}, '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, |