diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/arte.py | 114 | ||||
-rw-r--r-- | youtube_dl/extractor/bliptv.py | 14 | ||||
-rw-r--r-- | youtube_dl/extractor/brightcove.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/comedycentral.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 44 | ||||
-rw-r--r-- | youtube_dl/extractor/folketinget.py | 75 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/goldenmoustache.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/mtv.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/rtlnl.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/stanfordoc.py | 88 | ||||
-rw-r--r-- | youtube_dl/extractor/sztvhu.py | 24 | ||||
-rw-r--r-- | youtube_dl/extractor/telebruxelles.py | 60 | ||||
-rw-r--r-- | youtube_dl/extractor/vh1.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 20 |
16 files changed, 320 insertions, 152 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b687a56b4..fcb75af34 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .fktv import ( FKTVPosteckeIE, ) from .flickr import FlickrIE +from .folketinget import FolketingetIE from .fourtube import FourTubeIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE @@ -379,6 +380,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telemb import TeleMBIE from .tenplay import TenPlayIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b9a9440c0..3a57ce527 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -5,13 +5,12 @@ import re from .common import InfoExtractor from ..utils import ( - ExtractorError, find_xpath_attr, unified_strdate, - determine_ext, get_element_by_id, get_element_by_attribute, int_or_none, + qualities, ) # There are different sources of video in arte.tv, the extraction process @@ -102,79 +101,54 @@ class ArteTVPlus7IE(InfoExtractor): 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } + qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) - all_formats = [] + formats = [] for format_id, format_dict in player_info['VSR'].items(): - fmt = dict(format_dict) - fmt['format_id'] = format_id - all_formats.append(fmt) - # Some formats use the m3u8 protocol - all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) - def _match_lang(f): - if f.get('versionCode') is None: - return True - # Return true if that format is in the language of the url - if lang == 'fr': - l = 'F' - elif lang == 'de': - l = 'A' - else: - l = lang - regexes = [r'VO?%s' % l, r'VO?.-ST%s' 
% l] - return any(re.match(r, f['versionCode']) for r in regexes) - # Some formats may not be in the same language as the url - # TODO: Might want not to drop videos that does not match requested language - # but to process those formats with lower precedence - formats = filter(_match_lang, all_formats) - formats = list(formats) # in python3 filter returns an iterator - if not formats: - # Some videos are only available in the 'Originalversion' - # they aren't tagged as being in French or German - # Sometimes there are neither videos of requested lang code - # nor original version videos available - # For such cases we just take all_formats as is - formats = all_formats - if not formats: - raise ExtractorError('The formats list is empty') - - if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: - def sort_key(f): - return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) - else: - def sort_key(f): - versionCode = f.get('versionCode') - if versionCode is None: - versionCode = '' - return ( - # Sort first by quality - int(f.get('height', -1)), - int(f.get('bitrate', -1)), - # The original version with subtitles has lower relevance - re.match(r'VO-ST(F|A)', versionCode) is None, - # The version with sourds/mal subtitles has also lower relevance - re.match(r'VO?(F|A)-STM\1', versionCode) is None, - # Prefer http downloads over m3u8 - 0 if f['url'].endswith('m3u8') else 1, - ) - formats = sorted(formats, key=sort_key) - def _format(format_info): - info = { - 'format_id': format_info['format_id'], - 'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')), - 'width': int_or_none(format_info.get('width')), - 'height': int_or_none(format_info.get('height')), - 'tbr': int_or_none(format_info.get('bitrate')), + f = dict(format_dict) + versionCode = f.get('versionCode') + + langcode = { + 'fr': 'F', + 'de': 'A', + }.get(lang, lang) + lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode] + lang_pref = ( + None if versionCode is None 
else ( + 10 if any(re.match(r, versionCode) for r in lang_rexs) + else -10)) + source_pref = 0 + if versionCode is not None: + # The original version with subtitles has lower relevance + if re.match(r'VO-ST(F|A)', versionCode): + source_pref -= 10 + # The version with sourds/mal subtitles has also lower relevance + elif re.match(r'VO?(F|A)-STM\1', versionCode): + source_pref -= 9 + format = { + 'format_id': format_id, + 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, + 'language_preference': lang_pref, + 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'tbr': int_or_none(f.get('bitrate')), + 'quality': qfunc(f['quality']), + 'source_preference': source_pref, } - if format_info['mediaType'] == 'rtmp': - info['url'] = format_info['streamer'] - info['play_path'] = 'mp4:' + format_info['url'] - info['ext'] = 'flv' + + if f.get('mediaType') == 'rtmp': + format['url'] = f['streamer'] + format['play_path'] = 'mp4:' + f['url'] + format['ext'] = 'flv' else: - info['url'] = format_info['url'] - info['ext'] = determine_ext(info['url']) - return info - info_dict['formats'] = [_format(f) for f in formats] + format['url'] = f['url'] + + formats.append(format) + + self._sort_formats(formats) + info_dict['formats'] = formats return info_dict diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 2370c24b0..f2b02643d 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -166,9 +166,17 @@ class BlipTVIE(SubtitlesInfoExtractor): class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' _PAGE_SIZE = 12 IE_NAME = 'blip.tv:user' + _TEST = { + 'url': 'http://blip.tv/actone', + 'info_dict': { + 'id': 'actone', + 'title': 'Act One: The Series', + }, + 
'playlist_count': 5, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -179,6 +187,7 @@ class BlipTVUserIE(InfoExtractor): page = self._download_webpage(url, username, 'Downloading user page') mobj = re.search(r'data-users-id="([^"]+)"', page) page_base = page_base % mobj.group(1) + title = self._og_search_title(page) # Download video ids using BlipTV Ajax calls. Result size per # query is limited (currently to 12 videos) so we need to query @@ -215,4 +224,5 @@ class BlipTVUserIE(InfoExtractor): urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return [self.playlist_result(url_entries, playlist_title=username)] + return self.playlist_result( + url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a6920685e..2db7f9fef 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -111,6 +111,8 @@ class BrightcoveIE(InfoExtractor): lambda m: m.group(1) + '/>', object_str) # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') + # remove namespace to simplify extraction + object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) object_str = fix_xml_ampersands(object_str) object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) @@ -219,7 +221,7 @@ class BrightcoveIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage, + r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, 'error message', default=None) if error_msg is not None: raise ExtractorError( diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 49b978b4e..2e3ef3fda 100644 --- a/youtube_dl/extractor/comedycentral.py +++ 
b/youtube_dl/extractor/comedycentral.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .mtv import MTVServicesInfoExtractor from ..utils import ( compat_str, @@ -110,9 +109,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) + mobj = re.match(self._VALID_URL, url) if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b77f0e519..93a5a3d57 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -43,7 +43,11 @@ class InfoExtractor(object): information possibly downloading the video to the file system, among other possible outcomes. - The dictionaries must include the following fields: + The type field determines the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. @@ -87,6 +91,11 @@ class InfoExtractor(object): by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + * language_preference Is this in the correct requested + language? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), @@ -146,6 +155,38 @@ class InfoExtractor(object): Unless mentioned otherwise, None is equivalent to absence of information. + + _type "playlist" indicates multiple videos. 
+ There must be a key "entries", which is a list or a PagedList object, each + element of which is a valid dictionary under this specification. + + Additionally, playlists can have "title" and "id" attributes with the same + semantics as videos (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for example multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + + Additionally, it may have properties believed to be identical to the + resolved entity, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. 
@@ -615,6 +656,7 @@ class InfoExtractor(object): return ( preference, + f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py new file mode 100644 index 000000000..68e2db943 --- /dev/null +++ b/youtube_dl/extractor/folketinget.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + xpath_text, +) + + +class FolketingetIE(InfoExtractor): + IE_DESC = 'Folketinget (ft.dk; Danish parliament)' + _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx' + _TEST = { + 'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player', + 'info_dict': { + 'id': '1165642', + 'ext': 'mp4', + 'title': 'Åbent samråd i Erhvervsudvalget', + 'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet', + 'view_count': int, + 'width': 768, + 'height': 432, + 'tbr': 928000, + 'timestamp': 1416493800, + 'upload_date': '20141120', + 'duration': 3960, + }, + 'params': { + 'skip_download': 'rtmpdump required', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'(?s)<div class="video-item-agenda"[^>]*>(.*?)<', + webpage, 'description', fatal=False) + + player_params = compat_parse_qs(self._search_regex( + r'<embed src="http://ft\.arkena\.tv/flash/ftplayer\.swf\?([^"]+)"', + webpage, 'player params')) + xml_url = player_params['xml'][0] + doc = self._download_xml(xml_url, video_id) + + timestamp = 
parse_iso8601(xpath_text(doc, './/date')) + duration = parse_duration(xpath_text(doc, './/duration')) + width = int_or_none(xpath_text(doc, './/width')) + height = int_or_none(xpath_text(doc, './/height')) + view_count = int_or_none(xpath_text(doc, './/views')) + + formats = [{ + 'format_id': n.attrib['bitrate'], + 'url': xpath_text(n, './url', fatal=True), + 'tbr': int_or_none(n.attrib['bitrate']), + } for n in doc.findall('.//streams/stream')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'timestamp': timestamp, + 'width': width, + 'height': height, + 'duration': duration, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index af769ab61..c7a824c29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -979,7 +979,7 @@ class GenericIE(InfoExtractor): found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) if not found: # HTML5 video - found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage) + found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) if not found: found = re.search( r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py index 7e13b131b..10001d4d9 100644 --- a/youtube_dl/extractor/goldenmoustache.py +++ b/youtube_dl/extractor/goldenmoustache.py @@ -1,9 +1,7 @@ from __future__ import unicode_literals -import re from .common import InfoExtractor from ..utils import ( - parse_duration, int_or_none, ) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 474bdff7d..5f0f476b6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -145,7 +145,8 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( feed_url + '?' 
+ data, video_id, 'Downloading info', transform_source=fix_xml_ampersands) - return [self._get_video_info(item) for item in idoc.findall('.//item')] + return self.playlist_result( + [self._get_video_info(item) for item in idoc.findall('.//item')]) def _real_extract(self, url): title = url_basename(url) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 0ab1eb69c..4a188e5d4 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -38,10 +38,11 @@ class RtlXlIE(InfoExtractor): progname = info['abstracts'][0]['name'] subtitle = material['title'] or info['episodes'][0]['name'] - videopath = material['videopath'] - f4m_url = 'http://manifest.us.rtl.nl' + videopath + # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) + videopath = material['videopath'].replace('.f4m', '.m3u8') + m3u8_url = 'http://manifest.us.rtl.nl' + videopath - formats = self._extract_f4m_formats(f4m_url, uuid) + formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') video_urlpart = videopath.split('/flash/')[1][:-4] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' @@ -54,9 +55,12 @@ class RtlXlIE(InfoExtractor): { 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), 'format_id': 'pg-hd', + 'quality': 0, } ]) + self._sort_formats(formats) + return { 'id': uuid, 'title': '%s - %s' % (progname, subtitle), diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index 44c52c718..5feb4ff83 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,24 +11,23 @@ from ..utils import ( class StanfordOpenClassroomIE(InfoExtractor): - IE_NAME = u'stanfordoc' - IE_DESC = u'Stanford Open ClassRoom' - _VALID_URL = 
r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + IE_NAME = 'stanfordoc' + IE_DESC = 'Stanford Open ClassRoom' + _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' _TEST = { - u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', - u'file': u'PracticalUnix_intro-environment.mp4', - u'md5': u'544a9468546059d4e80d76265b0443b8', - u'info_dict': { - u"title": u"Intro Environment" + 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', + 'md5': '544a9468546059d4e80d76265b0443b8', + 'info_dict': { + 'id': 'PracticalUnix_intro-environment', + 'ext': 'mp4', + 'title': 'Intro Environment', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - if mobj.group('course') and mobj.group('video'): # A specific video + if mobj.group('course') and mobj.group('video'): # A specific video course = mobj.group('course') video = mobj.group('video') info = { @@ -35,7 +36,6 @@ class StanfordOpenClassroomIE(InfoExtractor): 'upload_date': None, } - self.report_extraction(info['id']) baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' xmlUrl = baseUrl + video + '.xml' mdoc = self._download_xml(xmlUrl, info['id']) @@ -43,63 +43,49 @@ class StanfordOpenClassroomIE(InfoExtractor): info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text except IndexError: - raise ExtractorError(u'Invalid metadata XML file') - info['ext'] = info['url'].rpartition('.')[2] - return [info] - elif mobj.group('course'): # A course page + raise 
ExtractorError('Invalid metadata XML file') + return info + elif mobj.group('course'): # A course page course = mobj.group('course') info = { 'id': course, - 'type': 'playlist', + '_type': 'playlist', 'uploader': None, 'upload_date': None, } - coursepage = self._download_webpage(url, info['id'], - note='Downloading course info page', - errnote='Unable to download course info page') + coursepage = self._download_webpage( + url, info['id'], + note='Downloading course info page', + errnote='Unable to download course info page') - info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) + info['title'] = self._html_search_regex( + r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._html_search_regex('<description>([^<]+)</description>', - coursepage, u'description', fatal=False) + info['description'] = self._html_search_regex( + r'(?s)<description>([^<]+)</description>', + coursepage, 'description', fatal=False) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) - info['list'] = [ - { - 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), - } - for vpage in links] - results = [] - for entry in info['list']: - assert entry['type'] == 'reference' - results += self.extract(entry['url']) - return results - else: # Root page + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info + else: # Root page info = { 'id': 'Stanford OpenClassroom', - 'type': 'playlist', + '_type': 'playlist', 'uploader': None, 'upload_date': None, } + info['title'] = info['id'] rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' rootpage = self._download_webpage(rootURL, info['id'], - errnote=u'Unable to download course info page') - - info['title'] = info['id'] + errnote='Unable to download course info page') links = 
orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) - info['list'] = [ - { - 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), - } - for cpage in links] - - results = [] - for entry in info['list']: - assert entry['type'] == 'reference' - results += self.extract(entry['url']) - return results + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index c9359fafb..aa5964acb 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -1,27 +1,24 @@ # -*- coding: utf-8 -*- - -import re +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext class SztvHuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { - u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', - u'file': u'20130909.mp4', - u'md5': u'a6df607b11fb07d0e9f2ad94613375cb', - u'info_dict': { - u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", - u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. 
A PET...', + 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', + 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', + 'info_dict': { + 'id': '20130909', + 'ext': 'mp4', + 'title': 'Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren', + 'description': 'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...', }, - u'skip': u'Service temporarily disabled as of 2013-11-20' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_file = self._search_regex( r'file: "...:(.*?)",', webpage, 'video file') @@ -39,7 +36,6 @@ class SztvHuIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'ext': determine_ext(video_url), 'description': description, 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py new file mode 100644 index 000000000..a3d05f97d --- /dev/null +++ b/youtube_dl/extractor/telebruxelles.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TeleBruxellesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', + 'md5': '59439e568c9ee42fb77588b2096b214f', + 'info_dict': { + 'id': '11942', + 'display_id': 'auditions-devant-parlement-francken-galant-tres-attendus', + 'ext': 'flv', + 'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition', + 'description': 're:Les auditions des ministres se poursuivent*' + }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, + }, { + 'url': 
'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/', + 'md5': '181d3fbdcf20b909309e5aef5c6c6047', + 'info_dict': { + 'id': '10091', + 'display_id': 'basket-brussels-bat-mons-80-74', + 'ext': 'flv', + 'title': 'Basket : le Brussels bat Mons 80-74', + 'description': 're:^Ils l\u2019on fait ! En basket, le B*', + }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + article_id = self._html_search_regex( + r"<article id=\"post-(\d+)\"", webpage, 'article ID') + title = self._html_search_regex( + r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') + description = self._og_search_description(webpage) + + rtmp_url = self._html_search_regex( + r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", + webpage, 'RTMP url') + rtmp_url = rtmp_url.replace("\" + \"", "") + + return { + 'id': article_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'url': rtmp_url, + 'ext': 'flv', + 'rtmp_live': True # if rtmpdump is not called with "--live" argument, the download is blocked and can be completed + } diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py index 2f77e3898..6be3774b7 100644 --- a/youtube_dl/extractor/vh1.py +++ b/youtube_dl/extractor/vh1.py @@ -121,4 +121,7 @@ class VH1IE(MTVIE): idoc = self._download_xml( doc_url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) - return [self._get_video_info(item) for item in idoc.findall('.//item')] + return self.playlist_result( + [self._get_video_info(item) for item in idoc.findall('.//item')], + playlist_id=video_id, + ) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 08f63be96..0cb837afc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -307,6 +307,11 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, + '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, + '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, + # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, } @@ -401,6 +406,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'format': '141', }, }, + # Controversy video + { + 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', + 'info_dict': { + 'id': 'T4XJQO3qol8', + 'ext': 'mp4', + 'upload_date': '20100909', + 'uploader': 'The Amazing Atheist', + 'uploader_id': 'TheAmazingAtheist', + 'title': 'Burning Everyone\'s Koran', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + } + } ] def __init__(self, *args, **kwargs): @@ -661,7 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = self.extract_id(url) # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id pref_cookies = [ c for c in self._downloader.cookiejar if c.domain == '.youtube.com' and c.name == 'PREF'] |