diff options
Diffstat (limited to 'youtube_dl/extractor')
29 files changed, 245 insertions, 152 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3f85c99cd..3c1807f15 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -421,6 +421,7 @@ from .vesti import VestiIE from .vevo import VevoIE from .vgtv import VGTVIE from .vh1 import VH1IE +from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index fcf296057..11f149f9e 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -3,12 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_HTTPError, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, - +) +from ..utils import ( ExtractorError, ) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ad22cbafd..a6920685e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -14,6 +14,7 @@ from ..utils import ( compat_str, compat_urllib_request, compat_parse_qs, + compat_urllib_parse_urlparse, determine_ext, ExtractorError, @@ -23,7 +24,7 @@ from ..utils import ( class BrightcoveIE(InfoExtractor): - _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)' + _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _TESTS = [ @@ -260,11 +261,19 @@ class BrightcoveIE(InfoExtractor): formats = [] for rend in renditions: url = rend['defaultURL'] + if not url: + continue if rend['remote']: - # This type of renditions are served through akamaihd.net, - # but they don't use f4m manifests - url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' - ext = 'flv' + url_comp = compat_urllib_parse_urlparse(url) + if url_comp.path.endswith('.m3u8'): + formats.extend( + self._extract_m3u8_formats(url, info['id'], 'mp4')) + continue + elif 'akamaihd.net' in url_comp.netloc: + # This type of renditions are served through + # akamaihd.net, but they don't use f4m manifests + url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' + ext = 'flv' else: ext = determine_ext(url) size = rend.get('size') diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 4f000292b..16d800512 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -27,7 +27,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -43,7 +43,7 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 'authors': [ 'Mike Wilmot' ], }, } @@ -94,7 +94,7 @@ class Channel9IE(InfoExtractor): def _extract_title(self, html): title = self._html_search_meta('title', html, 'title') - if title is None: + if title is None: title = self._og_search_title(html) TITLE_SUFFIX = ' (Channel 9)' if title is not None and title.endswith(TITLE_SUFFIX): @@ -115,7 +115,7 @@ class Channel9IE(InfoExtractor): return self._html_search_meta('description', html, 'description') def _extract_duration(self, html): - m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) + m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None def _extract_slides(self, html): @@ -167,7 +167,7 @@ class Channel9IE(InfoExtractor): return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) def _extract_content(self, html, content_path): - # Look for downloadable content + # Look for downloadable content formats = self._formats_from_html(html) slides = self._extract_slides(html) zip_ = self._extract_zip(html) @@ -258,16 +258,17 @@ class Channel9IE(InfoExtractor): webpage = self._download_webpage(url, content_path, 'Downloading web page') - page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage) - if page_type_m is None: - raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True) - - page_type = page_type_m.group('pagetype') - if page_type == 'List': # List page, may contain list of 'item'-like objects + page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage) + if page_type_m is not None: + page_type = page_type_m.group('pagetype') + if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content + return self._extract_entry_item(webpage, content_path) + elif page_type == 'Session': # Event session page, may contain downloadable content + return self._extract_session(webpage, content_path) + elif page_type == 'Event': + return self._extract_list(content_path) + else: + raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) + + else: # Assuming list return self._extract_list(content_path) - elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content - return self._extract_entry_item(webpage, content_path) - elif page_type == 'Session': # Event session page, may contain downloadable content - return self._extract_session(webpage, content_path) - else: - raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index d064a28f9..31fe906b4 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,11 +42,12 @@ class CinemassacreIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage) if not mobj: raise ExtractorError('Can\'t extract embed url and video id') playerdata_url = mobj.group('embed_url') video_id = mobj.group('video_id') + full_video_id = mobj.group('full_video_id') video_title = self._html_search_regex( r'<title>(?P<title>.+?)\|', webpage, 'title') @@ -59,41 +60,53 @@ class CinemassacreIE(InfoExtractor): vidurl = self._search_regex( r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') - vidid = self._search_regex( - r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') - videoserver = self._html_search_regex( - r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') - videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) - videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') + videolist_url = None - formats = [] - baseurl = vidurl[:vidurl.rfind('/')+1] - for video in videolist.findall('.//video'): - src = video.get('src') - if not src: - continue - file_ = src.partition(':')[-1] - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - bitrate = int_or_none(video.get('system-bitrate')) - format = { - 'url': baseurl + file_, - 'format_id': src.rpartition('.')[0].rpartition('_')[-1], - } - if width or height: - format.update({ - 'tbr': bitrate // 1000 if bitrate else None, - 'width': width, - 'height': height, - }) - else: - format.update({ - 'abr': bitrate // 1000 if bitrate else None, - 'vcodec': 'none', - }) - formats.append(format) - self._sort_formats(formats) + mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) + if mobj: + videoserver = mobj.group('videoserver') + mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) + vidid = mobj.group('vidid') if mobj else full_video_id + videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) + else: + mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) + if mobj: + videolist_url = mobj.group('smil') + + if videolist_url: + videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') + formats = [] + baseurl = vidurl[:vidurl.rfind('/')+1] + for video in videolist.findall('.//video'): + src = video.get('src') + if not src: + continue + file_ = src.partition(':')[-1] + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + bitrate = int_or_none(video.get('system-bitrate')) + format = { + 'url': baseurl + file_, + 'format_id': src.rpartition('.')[0].rpartition('_')[-1], + } + if width or height: + format.update({ + 'tbr': bitrate // 1000 if bitrate else None, + 'width': width, + 'height': height, + }) + else: + format.update({ + 'abr': bitrate // 1000 if bitrate else None, + 'vcodec': 'none', + }) + formats.append(format) + self._sort_formats(formats) + else: + formats = [{ + 'url': vidurl, + }] return { 'id': video_id, diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 386f080d2..abf8cc280 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -4,14 +4,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, +from ..compat import ( compat_parse_qs, compat_urllib_parse, - remove_end, - HEADRequest, compat_HTTPError, ) +from ..utils import ( + ExtractorError, + HEADRequest, + remove_end, +) class CloudyIE(InfoExtractor): diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 78877b1cf..3826ce7e1 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -16,9 +16,10 @@ class CNNIE(InfoExtractor): _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', - 'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4', 'md5': '3e6121ea48df7e2259fe73a0628605c4', 'info_dict': { + 'id': 'sports_2013_06_09_nadal-1-on-1.cnn', + 'ext': 'mp4', 'title': 'Nadal wins 8th French Open title', 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', 'duration': 135, @@ -27,9 +28,10 @@ class CNNIE(InfoExtractor): }, { "url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", - "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", "md5": "b5cc60c60a3477d185af8f19a2a26f4e", "info_dict": { + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'ext': 'mp4', "title": "Student's epic speech stuns new freshmen", "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", "upload_date": "20130821", diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7e4113213..b77f0e519 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -12,13 +12,14 @@ import sys import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, compat_str, - +) +from ..utils import ( clean_html, compiled_regex_type, ExtractorError, @@ -403,7 +404,7 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -424,8 +425,11 @@ class InfoExtractor(object): _name = name if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + else: + return mobj.group(group) elif default is not _NO_DEFAULT: return default elif fatal: @@ -435,11 +439,11 @@ class InfoExtractor(object): 'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ - res = self._search_regex(pattern, string, name, default, fatal, flags) + res = self._search_regex(pattern, string, name, default, fatal, flags, group) if res: return clean_html(res).strip() else: @@ -533,9 +537,9 @@ class InfoExtractor(object): display_name = name return self._html_search_regex( r'''(?ix)<meta - (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) - [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal, **kwargs) + (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), + html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index cc612d08e..0bd0eccba 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -17,7 +17,6 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, unified_strdate, - clean_html, urlencode_postdata, ) from ..aes import ( diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 5f24ac721..aefca848a 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,7 +5,8 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote, url_basename +from ..compat import compat_urllib_parse_unquote +from ..utils import url_basename class DropboxIE(InfoExtractor): diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3ad993751..104803563 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -5,12 +5,14 @@ import re import socket from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_http_client, compat_str, compat_urllib_error, compat_urllib_parse, compat_urllib_request, +) +from ..utils import ( urlencode_postdata, ExtractorError, limit_length, diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 3d67b9d60..d570e3f6a 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -8,12 +8,11 @@ from ..utils import ( compat_urllib_parse, compat_urlparse, unescapeHTML, - get_meta_content, ) class GameSpotIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' + _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TEST = { 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -26,10 +25,10 @@ class GameSpotIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('page_id') + page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video') + data_video_json = self._search_regex( + r'data-video=["\'](.*?)["\']', webpage, 'data video') data_video = json.loads(unescapeHTML(data_video_json)) # Transform the manifest url to a link to the mp4 files @@ -41,7 +40,8 @@ class GameSpotIE(InfoExtractor): http_path = f4m_path[1:].split('/', 1)[1] http_template = re.sub(QUALITIES_RE, r'%s', http_path) http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) + http_template = compat_urlparse.urljoin( + 'http://video.gamespotcdn.com/', http_template) formats = [] for q in qualities: formats.append({ @@ -52,8 +52,9 @@ class GameSpotIE(InfoExtractor): return { 'id': data_video['guid'], + 'display_id': page_id, 'title': compat_urllib_parse.unquote(data_video['title']), 'formats': formats, - 'description': get_meta_content('description', webpage), + 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8abc340b4..01d6a57f8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,11 +7,12 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..utils import ( +from ..compat import ( compat_urllib_parse, compat_urlparse, compat_xml_parse_error, - +) +from ..utils import ( determine_ext, ExtractorError, float_or_none, @@ -99,6 +100,22 @@ class GenericIE(InfoExtractor): 'uploader': 'Championat', }, }, + { + # https://github.com/rg3/youtube-dl/issues/3541 + 'add_ie': ['Brightcove'], + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 77c3ad4fc..66ca37918 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -5,13 +5,15 @@ import random import math from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, +from ..compat import ( compat_str, compat_chr, compat_ord, ) +from ..utils import ( + ExtractorError, + float_or_none, +) class GloboIE(InfoExtractor): diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py index 7bca21ad0..18474cbb7 100644 --- a/youtube_dl/extractor/goshgay.py +++ b/youtube_dl/extractor/goshgay.py @@ -1,15 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( compat_urlparse, - str_to_int, ExtractorError, ) -import json class GoshgayIE(InfoExtractor): @@ -27,36 +23,27 @@ class GoshgayIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title') + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', webpage, default='false') + config_url = self._search_regex( + r"'config'\s*:\s*'([^']+)'", webpage, 'config URL') - player_config = self._search_regex( - r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings') - player_vars = json.loads(player_config.replace("'", '"')) - width = str_to_int(player_vars.get('width')) - height = str_to_int(player_vars.get('height')) - config_uri = player_vars.get('config') + config = self._download_xml( + config_url, video_id, 'Downloading player config XML') - if config_uri is None: - raise ExtractorError('Missing config URI') - node = self._download_xml(config_uri, video_id, 'Downloading player config XML', - errnote='Unable to download XML') - if node is None: + if config is None: raise ExtractorError('Missing config XML') - if node.tag != 'config': + if config.tag != 'config': raise ExtractorError('Missing config attribute') - fns = node.findall('file') - imgs = node.findall('image') - if len(fns) != 1: + fns = config.findall('file') + if len(fns) < 1: raise ExtractorError('Missing media URI') video_url = fns[0].text - if len(imgs) < 1: - thumbnail = None - else: - thumbnail = imgs[0].text url_comp = compat_urlparse.urlparse(url) ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2]) @@ -65,9 +52,7 @@ class GoshgayIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'width': width, - 'height': height, 'thumbnail': thumbnail, 'http_referer': ref, - 'age_limit': 18, + 'age_limit': 0 if family_friendly == 'true' else 18, } diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index d41c0413f..278d9f527 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - get_meta_content, + determine_ext, int_or_none, parse_iso8601, ) @@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor): 'title': ( "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" ), - 'format_id': 'mp4_720', + 'format_id': 'mp4_720p', 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': 're:^https?://.*\.jpe?g$', } } @@ -49,11 +49,12 @@ class HeiseIE(InfoExtractor): info = { 'id': video_id, 'thumbnail': self._og_search_thumbnail(webpage), - 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'timestamp': parse_iso8601( + self._html_search_meta('date', webpage)), 'description': self._og_search_description(webpage), } - title = get_meta_content('fulltitle', webpage) + title = self._html_search_meta('fulltitle', webpage) if title: info['title'] = title else: @@ -64,9 +65,12 @@ class HeiseIE(InfoExtractor): label = source_node.attrib['label'] height = int_or_none(self._search_regex( r'^(.*?_)?([0-9]+)p$', label, 'height', default=None)) + video_url = source_node.attrib['file'] + ext = determine_ext(video_url, '') formats.append({ - 'url': source_node.attrib['file'], + 'url': video_url, 'format_note': label, + 'format_id': '%s_%s' % (ext, label), 'height': height, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4536db3bf..6108ed552 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -6,7 +6,6 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, - get_element_by_attribute, ) @@ -27,10 +26,11 @@ class ImdbIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = get_element_by_attribute('itemprop', 'description', webpage) + descr = self._html_search_regex( + r'(?s)<span itemprop="description">(.*?)</span>', + webpage, 'description', fatal=False) available_formats = re.findall( r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, flags=re.MULTILINE) @@ -73,9 +73,7 @@ class ImdbListIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - list_id = mobj.group('id') - + list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 07ef682ee..d16d483ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -5,11 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - get_element_by_id, - parse_iso8601, determine_ext, - int_or_none, float_or_none, + get_element_by_id, + int_or_none, + parse_iso8601, str_to_int, ) @@ -30,7 +30,7 @@ class IzleseneIE(InfoExtractor): 'description': 'md5:253753e2655dde93f59f74b572454f6d', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'pelikzzle', - 'timestamp': 1404298698, + 'timestamp': 1404302298, 'upload_date': '20140702', 'duration': 95.395, 'age_limit': 0, @@ -46,7 +46,7 @@ class IzleseneIE(InfoExtractor): 'description': 'Tarkan Dortmund 2006 Konseri', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'parlayankiz', - 'timestamp': 1163318593, + 'timestamp': 1163322193, 'upload_date': '20061112', 'duration': 253.666, 'age_limit': 0, @@ -55,10 +55,9 @@ class IzleseneIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.izlesene.com/video/%s' % video_id + video_id = self._match_id(url) + url = 'http://www.izlesene.com/video/%s' % video_id webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 263f68773..102e29f7a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -4,6 +4,7 @@ import random import re from .common import InfoExtractor +from ..utils import ExtractorError class Laola1TvIE(InfoExtractor): diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index ccb5959c4..a89153985 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -7,11 +7,12 @@ import re import json from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_ord, compat_urllib_parse, compat_urllib_request, - +) +from ..utils import ( ExtractorError, ) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 62d5707fe..45cbd4ee9 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -12,6 +12,7 @@ from ..utils import ( unified_strdate, parse_duration, int_or_none, + ExtractorError, ) @@ -108,6 +109,9 @@ class NiconicoIE(InfoExtractor): flv_info_request, video_id, note='Downloading flv info', errnote='Unable to download flv info') + if 'deleted=' in flv_info_webpage: + raise ExtractorError('The video has been deleted.', + expected=True) video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 645a1e06d..17880471d 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -6,6 +6,7 @@ import os.path from .common import InfoExtractor from ..utils import ( + ExtractorError, compat_urllib_parse, compat_urllib_request, ) @@ -29,6 +30,12 @@ class PlayedIE(InfoExtractor): video_id = self._match_id(url) orig_webpage = self._download_webpage(url, video_id) + + m_error = re.search( + r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage) + if m_error: + raise ExtractorError(m_error.group('msg'), expected=True) + fields = re.findall( r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) data = dict(fields) diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index 0a3a71448..962b524e9 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote +from ..compat import compat_urllib_parse_unquote class Ro220IE(InfoExtractor): diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index a73f3c43a..e7b79243a 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -29,7 +29,7 @@ class TruTubeIE(InfoExtractor): # filehd is always 404 video_url = xpath_text(config, './file', 'video URL', fatal=True) - title = xpath_text(config, './title', 'title') + title = xpath_text(config, './title', 'title').strip() thumbnail = xpath_text(config, './image', ' thumbnail') return { diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index cee1ea8f6..875450908 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( compat_urlparse, - get_meta_content, ) @@ -79,7 +78,7 @@ class UstreamChannelIE(InfoExtractor): m = re.match(self._VALID_URL, url) display_id = m.group('slug') webpage = self._download_webpage(url, display_id) - channel_id = get_meta_content('ustream:channel_id', webpage) + channel_id = self._html_search_meta('ustream:channel_id', webpage) BASE = 'http://www.ustream.tv' next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py new file mode 100644 index 000000000..f11ca8217 --- /dev/null +++ b/youtube_dl/extractor/vice.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ExtractorError + + +class ViceIE(InfoExtractor): + _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)' + + _TEST = { + 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + 'info_dict': { + 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'ext': 'mp4', + 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + try: + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', webpage, + 'ooyala embed code') + ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + print(ooyala_url) + except ExtractorError: + raise ExtractorError('The page doesn\'t contain a video', expected=True) + return self.url_result(ooyala_url, ie='Ooyala') + diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d9cad0ea5..c744d4f04 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -7,11 +7,13 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor -from ..utils import ( +from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_request, compat_urlparse, +) +from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 3377a543e..d6dec25ca 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -37,7 +37,7 @@ class WimpIE(InfoExtractor): video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r"'file'\s*:\s*'([^']+)'", webpage, 'video URL') + r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL') if YoutubeIE.suitable(video_url): self.to_screen('Found YouTube video') return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index aad8ffbf4..c77d4056f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -684,7 +684,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video info self.report_video_info_webpage_download(video_id) if re.search(r'player-age-gate-content">', video_webpage) is not None: - self.report_age_confirmation() age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube @@ -692,12 +691,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'), + r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) else: age_gate = False |