diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/ard.py | 33 | ||||
-rw-r--r-- | youtube_dl/extractor/comedycentral.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/extremetube.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/fc2.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/ivi.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/mailru.py | 55 | ||||
-rw-r--r-- | youtube_dl/extractor/naver.py | 34 | ||||
-rw-r--r-- | youtube_dl/extractor/nrk.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/tagesschau.py | 79 | ||||
-rw-r--r-- | youtube_dl/extractor/teachertube.py | 85 | ||||
-rw-r--r-- | youtube_dl/extractor/teachingchannel.py | 33 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/vevo.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/xvideos.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/yahoo.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 9 |
17 files changed, 327 insertions, 91 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b689dc3c9..72523c54d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -267,6 +267,12 @@ from .streamcz import StreamCZIE from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .teachertube import ( + TeacherTubeIE, + TeacherTubeClassroomIE, +) +from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index b88f71bc4..c6d22c029 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -38,15 +38,19 @@ class ARDIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title') + [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', + r'<meta name="dcterms.title" content="(.*?)"/>', + r'<h4 class="headline">(.*?)</h4>'], + webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - streams = [ - mo.groupdict() - for mo in re.finditer( - r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)] + + media_info = self._download_json( + 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) + # The second element of the _mediaArray contains the standard http urls + streams = media_info['_mediaArray'][1]['_mediaStreamArray'] if not streams: if '"fsk"' in webpage: raise ExtractorError('This video is only available after 20:00') @@ -54,21 +58,12 @@ class ARDIE(InfoExtractor): formats = [] for s in streams: format = { - 'quality': int(s['quality']), + 'quality': s['_quality'], + 'url': s['_stream'], } - if s.get('rtmp_url'): - format['protocol'] = 'rtmp' - format['url'] = s['rtmp_url'] - format['playpath'] = s['video_url'] - else: - format['url'] = s['video_url'] - - quality_name = self._search_regex( - r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], - 'quality name', default='NA') - format['format_id'] = '%s-%s-%s-%s' % ( - determine_ext(format['url']), quality_name, s['media_type'], - s['quality']) + + format['format_id'] = '%s-%s' % ( + determine_ext(format['url']), format['quality']) formats.append(format) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 6e3a316c6..ba4d73ab8 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -188,7 +188,7 @@ class ComedyCentralShowsIE(InfoExtractor): }) formats.append({ 'format_id': 'rtmp-%s' % format, - 'url': rtmp_video_url, + 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), 'ext': self._video_extensions.get(format, 'mp4'), 'height': h, 'width': w, diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index ff7c0cd3e..14a196ffc 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -37,7 +37,7 @@ class ExtremeTubeIE(InfoExtractor): webpage = self._download_webpage(req, video_id) video_title = self._html_search_regex( - r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, 'title') + r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', fatal=False) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index ca8993241..18f91efac 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -13,7 +13,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?P<lang>[^/]+)/content/(?P<id>[^/]+)' + _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?content/(?P<id>[^/]+)' IE_NAME = 'fc2' _TEST = { 'url': 'http://video.fc2.com/en/content/20121103kUan1KHs', @@ -36,7 +36,7 @@ class FC2IE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) refer = url.replace('/content/', '/a/content/') - mimi = hashlib.md5(video_id + '_gGddgPfeaf_gzyr').hexdigest() + mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() info_url = ( "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&". diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 1ba4966c7..528be1524 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -33,14 +33,14 @@ class IviIE(InfoExtractor): }, # Serial's serie { - 'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791', - 'md5': '3e6cc9a848c1d2ebcc6476444967baa9', + 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', + 'md5': '221f56b35e3ed815fde2df71032f4b3e', 'info_dict': { - 'id': '74791', + 'id': '9549', 'ext': 'mp4', - 'title': 'Дежурный ангел - 1 серия', - 'duration': 2490, - 'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', + 'title': 'Двое из ларца - Серия 1', + 'duration': 2655, + 'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg', }, 'skip': 'Only works from Russia', } diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 5016989cc..7460d81cd 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -9,29 +9,48 @@ from .common import InfoExtractor class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)' + _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' - _TEST = { - 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', - 'md5': 'dea205f03120046894db4ebb6159879a', - 'info_dict': { - 'id': '46301138', - 'ext': 'mp4', - 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', - 'timestamp': 1393232740, - 'upload_date': '20140224', - 'uploader': 'sonypicturesrus', - 'uploader_id': 'sonypicturesrus@mail.ru', - 'duration': 184, - } - } + _TESTS = [ + { + 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', + 'md5': 'dea205f03120046894db4ebb6159879a', + 'info_dict': { + 'id': '46301138', + 'ext': 'mp4', + 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', + 'timestamp': 1393232740, + 'upload_date': '20140224', + 'uploader': 'sonypicturesrus', + 'uploader_id': 'sonypicturesrus@mail.ru', + 'duration': 184, + }, + }, + { + 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', + 'md5': '00a91a58c3402204dcced523777b475f', + 'info_dict': { + 'id': '46843144', + 'ext': 'mp4', + 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', + 'timestamp': 1397217632, + 'upload_date': '20140411', + 'uploader': 'hitech', + 'uploader_id': 'hitech@corp.mail.ru', + 'duration': 245, + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('idv1') + + if not video_id: + video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') video_data = self._download_json( - 'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') + 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') author = video_data['author'] uploader = author['name'] @@ -40,6 +59,8 @@ class MailRuIE(InfoExtractor): movie = video_data['movie'] content_id = str(movie['contentId']) title = movie['title'] + if title.endswith('.mp4'): + title = title[:-4] thumbnail = movie['poster'] duration = movie['duration'] diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 4cab30631..c0231c197 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -12,12 +14,13 @@ class NaverIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { - u'url': u'http://tvcast.naver.com/v/81652', - u'file': u'81652.mp4', - u'info_dict': { - u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', - u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - u'upload_date': u'20130903', + 'url': 'http://tvcast.naver.com/v/81652', + 'info_dict': { + 'id': '81652', + 'ext': 'mp4', + 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'upload_date': '20130903', }, } @@ -28,7 +31,7 @@ class NaverIE(InfoExtractor): m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', webpage) if m_id is None: - raise ExtractorError(u'couldn\'t extract vid and key') + raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,}) @@ -39,22 +42,27 @@ class NaverIE(InfoExtractor): }) info = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, - video_id, u'Downloading video info') + video_id, 'Downloading video info') urls = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, - video_id, u'Downloading video formats info') + video_id, 'Downloading video formats info') formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): domain = format_el.find('Domain').text - if domain.startswith('rtmp'): - continue - formats.append({ + f = { 'url': domain + format_el.find('uri').text, 'ext': 'mp4', 'width': int(format_el.find('width').text), 'height': int(format_el.find('height').text), - }) + } + if domain.startswith('rtmp'): + f.update({ + 'ext': 'flv', + 'rtmp_protocol': '1', # rtmpt + }) + formats.append(f) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f5117d7b3..3a6a7883e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,7 +72,7 @@ class NRKIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'http://tv\.nrk\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})' + _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})' _TESTS = [ { diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py new file mode 100644 index 000000000..36331529e --- /dev/null +++ b/youtube_dl/extractor/tagesschau.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class TagesschauIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', + 'md5': 'bcdeac2194fb296d599ce7929dfa4009', + 'info_dict': { + 'id': '1399128', + 'ext': 'mp4', + 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen', + 'description': 'md5:69da3c61275b426426d711bde96463ab', + 'thumbnail': 're:^http:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/video/video-196.html', + 'md5': '8aaa8bf3ae1ca2652309718c03019128', + 'info_dict': { + 'id': '196', + 'ext': 'mp4', + 'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt', + 'description': 'md5:f22e4af75821d174fa6c977349682691', + 'thumbnail': 're:http://.*\.jpg', + }, + }] + + _FORMATS = { + 's': {'width': 256, 'height': 144, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 544, 'quality': 3}, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + if video_id.startswith('-'): + display_id = video_id.strip('-') + else: + display_id = video_id + + webpage = self._download_webpage(url, display_id) + + playerpage = self._download_webpage( + 'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, + display_id, 'Downloading player page') + + medias = re.findall( + r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', + playerpage) + + formats = [] + for url, ext, res in medias: + f = { + 'format_id': res + '_' + ext, + 'url': url, + 'ext': ext, + } + f.update(self._FORMATS.get(res, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1] + + return { + 'id': display_id, + 'title': self._og_search_title(webpage).strip(), + 'thumbnail': 'http://www.tagesschau.de' + thumbnail, + 'formats': formats, + 'description': self._og_search_description(webpage).strip(), + } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py new file mode 100644 index 000000000..4740f3d56 --- /dev/null +++ b/youtube_dl/extractor/teachertube.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class TeacherTubeIE(InfoExtractor): + IE_NAME = 'teachertube' + IE_DESC = 'teachertube.com videos' + + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/viewVideo\.php\?video_id=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', + 'md5': 'f9434ef992fd65936d72999951ee254c', + 'info_dict': { + 'id': '339997', + 'ext': 'mp4', + 'title': 'Measures of dispersion from a frequency table_x264', + 'description': 'md5:a3e9853487185e9fcd7181a07164650b', + 'thumbnail': 're:http://.*\.jpg', + }, + }, { + 'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064', + 'md5': '0d625ec6bc9bf50f70170942ad580676', + 'info_dict': { + 'id': '340064', + 'ext': 'mp4', + 'title': 'How to Make Paper Dolls _ Paper Art Projects', + 'description': 'md5:2ca52b20cd727773d1dc418b3d6bd07b', + 'thumbnail': 're:http://.*\.jpg', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + url = self._html_search_meta('twitter:player:stream', webpage, 'twitter player') + + formats = [{ + 'format_id': 'flv', + 'url': url.replace('mp4v', 'flv').replace('.mp4', '.flv'), + 'quality': 0, + 'ext': 'flv', + }, { + 'format_id': 'mp4', + 'url': url, + 'quality': 1, + 'ext': 'mp4', + }] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + } + + +class TeacherTubeClassroomIE(InfoExtractor): + IE_NAME = 'teachertube:classroom' + IE_DESC = 'teachertube.com online classrooms' + + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user') + + rss = self._download_xml('http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id, + user_id, 'Downloading classroom RSS') + + entries = [] + for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'): + entries.append(self.url_result(url.attrib['url'], 'TeacherTube')) + + return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py new file mode 100644 index 000000000..117afa9bf --- /dev/null +++ b/youtube_dl/extractor/teachingchannel.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TeachingChannelIE(InfoExtractor): + _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)' + + _TEST = { + 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', + 'info_dict': { + 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'ext': 'mp4', + 'title': 'A History of Teaming', + 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + ooyala_code = self._search_regex( + r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code') + + return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f15780ef5..b6b2dba9c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -18,17 +20,17 @@ class ThePlatformIE(InfoExtractor): _TEST = { # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ - u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', - u'info_dict': { - u'id': u'e9I_cZgTgIPd', - u'ext': u'flv', - u'title': u'Blackberry\'s big, bold Z30', - u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', - u'duration': 247, + 'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', + 'info_dict': { + 'id': 'e9I_cZgTgIPd', + 'ext': 'flv', + 'title': 'Blackberry\'s big, bold Z30', + 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', + 'duration': 247, }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, } @@ -39,7 +41,7 @@ class ThePlatformIE(InfoExtractor): error_msg = next( n.attrib['abstract'] for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == u'Geographic Restriction') + if n.attrib.get('title') == 'Geographic Restriction') except StopIteration: pass else: @@ -101,8 +103,7 @@ class ThePlatformIE(InfoExtractor): config_url = url+ '&form=json' config_url = config_url.replace('swf/', 'config/') config_url = config_url.replace('onsite/', 'onsite/config/') - config_json = self._download_webpage(config_url, video_id, u'Downloading config') - config = json.loads(config_json) + config = self._download_json(config_url, video_id, 'Downloading config') smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' else: smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ea34a8f16..eada13ce9 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -16,7 +16,7 @@ class VevoIE(InfoExtractor): (currently used by MTVIE) """ _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| + (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 85e99e1b0..7e0044824 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -5,18 +5,21 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + ExtractorError, + clean_html, ) class XVideosIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' _TEST = { - 'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1', - 'file': '939581.flv', - 'md5': '1d0c835822f0a71a7bf011855db929d0', + 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', + 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', 'info_dict': { - "title": "Funny Porns By >>>>S<<<<<< -1", - "age_limit": 18, + 'id': '4588838', + 'ext': 'flv', + 'title': 'Biker Takes his Girl', + 'age_limit': 18, } } @@ -28,6 +31,10 @@ class XVideosIE(InfoExtractor): self.report_extraction(video_id) + mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) + if mobj: + raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) + # Extract video URL video_url = compat_urllib_parse.unquote( self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 393f6ffbe..d84be2562 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -21,7 +21,7 @@ class YahooIE(InfoExtractor): 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'md5': '4962b075c08be8690a922ee026d05e69', 'info_dict': { - 'id': '214727115', + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', 'ext': 'mp4', 'title': 'Julian Smith & Travis Legg Watch Julian Smith', 'description': 'Julian and Travis watch Julian Smith', @@ -31,7 +31,7 @@ class YahooIE(InfoExtractor): 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', 'info_dict': { - 'id': '103000935', + 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', 'title': 'Codefellas - The Cougar Lies with Spanish Moss', 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', @@ -58,9 +58,11 @@ class YahooIE(InfoExtractor): r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) if items_json is None: - long_id = self._search_regex( + CONTENT_ID_REGEXES = [ r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', - webpage, 'content ID') + r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"' + ] + long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID') video_id = long_id else: items = json.loads(items_json) @@ -68,9 +70,9 @@ class YahooIE(InfoExtractor): # The 'meta' field is not always in the video webpage, we request it # from another page long_id = info['id'] - return self._get_info(long_id, video_id) + return self._get_info(long_id, video_id, webpage) - def _get_info(self, long_id, video_id): + def _get_info(self, long_id, video_id, webpage): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' ' AND protocol="http"' % long_id) @@ -113,7 +115,7 @@ class YahooIE(InfoExtractor): 'title': meta['title'], 'formats': formats, 'description': clean_html(meta['description']), - 'thumbnail': meta['thumbnail'], + 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), } @@ -137,7 +139,7 @@ class YahooNewsIE(YahooIE): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id') - return self._get_info(long_id, video_id) + return self._get_info(long_id, video_id, webpage) class YahooSearchIE(SearchInfoExtractor): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 981ca62c0..7c50881c4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -223,6 +223,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, @@ -1140,7 +1141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage) if mobj is None: mobj = re.search( - r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded) on (.*?)</strong>', + r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>', video_webpage) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) @@ -1414,11 +1415,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): title_span = (search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'''(?x)data-video-username="(.*?)".*? + video_re = r'''(?x)data-video-username=".*?".*? href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) - matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) - # Some of the videos may have been deleted, their username field is empty - ids = [video_id for (username, video_id) in matches if username] + ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) |