diff options
Diffstat (limited to 'youtube_dl')
38 files changed, 1025 insertions, 194 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 14a1d06ab..e7194f3e3 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -162,6 +162,7 @@ class YoutubeDL(object): default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. + extract_flat: Do not resolve URLs, return the immediate result. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -558,7 +559,12 @@ class YoutubeDL(object): Returns the resolved ie_result. """ - result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system + result_type = ie_result.get('_type', 'video') + + if self.params.get('extract_flat', False): + if result_type in ('url', 'url_transparent'): + return ie_result + if result_type == 'video': self.add_extra_info(ie_result, extra_info) return self.process_video_result(ie_result, download=download) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 429630ce5..5d2137fe5 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -66,6 +66,10 @@ __authors__ = ( 'Naglis Jonaitis', 'Charles Chen', 'Hassaan Ali', + 'Dobrosław Żybort', + 'David Fabijan', + 'Sebastian Haas', + 'Alexander Kirk', ) __license__ = 'Public Domain' diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 917f3450e..9ce97f5fe 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -292,7 +292,7 @@ class FileDownloader(object): def real_download(self, filename, info_dict): """Real download process. Redefine in subclasses.""" - raise NotImplementedError(u'This method must be implemented by sublcasses') + raise NotImplementedError(u'This method must be implemented by subclasses') def _hook_progress(self, status): for ph in self._progress_hooks: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 66c873789..7ad5d9318 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -69,6 +69,7 @@ from .dfb import DFBIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .drtv import DRTVIE +from .dump import DumpIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE @@ -77,6 +78,10 @@ from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .eitb import EitbIE +from .ellentv import ( + EllenTVIE, + EllenTVClipsIE, +) from .elpais import ElPaisIE from .empflix import EmpflixIE from .engadget import EngadgetIE @@ -126,6 +131,7 @@ from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE @@ -146,6 +152,7 @@ from .ivi import ( from .izlesene import IzleseneIE from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .jpopsukitv import JpopsukiIE @@ -177,10 +184,12 @@ from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE from .mpora import MporaIE from .mofosex import MofosexIE +from .mojvideo import MojvideoIE from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE @@ -224,9 +233,12 @@ from .nrk import ( from .ntv import NTVIE from .nytimes import NYTimesIE from .nuvid import NuvidIE -from .oe1 import OE1IE from .ooyala import OoyalaIE -from .orf import ORFIE +from .orf import ( + ORFTVthekIE, + ORFOE1IE, + ORFFM4IE, +) from .parliamentliveuk import ParliamentLiveUKIE from .pbs import PBSIE from .photobucket import PhotobucketIE @@ -247,6 +259,7 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE +from .rtlnl import RtlXlIE from .rtlnow import RTLnowIE from .rts import RTSIE from .rtve import RTVEALaCartaIE @@ -383,6 +396,7 @@ from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .wrzuta import WrzutaIE from .xbef import XBefIE +from .xboxclips import XboxClipsIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 7e93bc4df..748608826 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,5 +1,7 @@ #coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -13,13 +15,14 @@ class AparatIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' _TEST = { - u'url': u'http://www.aparat.com/v/wP8On', - u'file': u'wP8On.mp4', - u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', - u'info_dict': { - u"title": u"تیم گلکسی 11 - زومیت", + 'url': 'http://www.aparat.com/v/wP8On', + 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'info_dict': { + 'id': 'wP8On', + 'ext': 'mp4', + 'title': 'تیم گلکسی 11 - زومیت', }, - #u'skip': u'Extremely unreliable', + # 'skip': 'Extremely unreliable', } def _real_extract(self, url): @@ -29,8 +32,8 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + - video_id + u'/vt/frame') + embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + + video_id + '/vt/frame') webpage = self._download_webpage(embed_url, video_id) video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index dc8657b67..4359b88d1 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -6,6 +6,7 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, + int_or_none, ) @@ -110,8 +111,8 @@ class AppleTrailersIE(InfoExtractor): formats.append({ 'url': format_url, 'format': format['type'], - 'width': format['width'], - 'height': int(format['height']), + 'width': int_or_none(format['width']), + 'height': int_or_none(format['height']), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 957bdefcb..7f0da8ab6 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -51,6 +51,9 @@ class ARDIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage: + raise ExtractorError('Video %s is no longer available' % video_id, expected=True) + title = self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms.title" content="(.*?)"/>', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 9591bad8a..d86dbba8e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -109,15 +109,19 @@ class ArteTVPlus7IE(InfoExtractor): regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url + # TODO: Might want not to drop videos that does not match requested language + # but to process those formats with lower precedence formats = filter(_match_lang, all_formats) - formats = list(formats) # in python3 filter returns an iterator + formats = list(formats) # in python3 filter returns an iterator if not formats: # Some videos are only available in the 'Originalversion' # they aren't tagged as being in French or German - if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats): - formats = all_formats - else: - raise ExtractorError(u'The formats list is empty') + # Sometimes there are neither videos of requested lang code + # nor original version videos available + # For such cases we just take all_formats as is + formats = all_formats + if not formats: + raise ExtractorError('The formats list is empty') if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: def sort_key(f): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 342bfb8b3..2e6eeac08 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -463,8 +463,9 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = self._og_regexes('video') - if secure: regexes = self._og_regexes('video:secure_url') + regexes + regexes = self._og_regexes('video') + self._og_regexes('video:url') + if secure: + regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) def _og_search_url(self, html, **kargs): diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cb8e06822..8049779b0 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -30,7 +30,7 @@ class DFBIE(InfoExtractor): video_id) video_info = player_info.find('video') - f4m_info = self._download_xml(video_info.find('url').text, video_id) + f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id) token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py new file mode 100644 index 000000000..6b651778a --- /dev/null +++ b/youtube_dl/extractor/dump.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DumpIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/' + + _TEST = { + 'url': 'http://www.dump.com/oneus/', + 'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99', + 'info_dict': { + 'id': 'oneus', + 'ext': 'flv', + 'title': "He's one of us.", + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex( + r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL') + + thumb = self._og_search_thumbnail(webpage) + title = self._search_regex(r'<b>([^"]+)</b>', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumb, + } diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py new file mode 100644 index 000000000..3e7923648 --- /dev/null +++ b/youtube_dl/extractor/ellentv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class EllenTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)' + _TEST = { + 'url': 'http://www.ellentv.com/videos/0-7jqrsr18/', + 'md5': 'e4af06f3bf0d5f471921a18db5764642', + 'info_dict': { + 'id': '0-7jqrsr18', + 'ext': 'mp4', + 'title': 'What\'s Wrong with These Photos? A Whole Lot', + 'timestamp': 1406876400, + 'upload_date': '20140801', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + timestamp = parse_iso8601(self._search_regex( + r'<span class="publish-date"><time datetime="([^"]+)">', + webpage, 'timestamp')) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'url': self._html_search_meta('VideoURL', webpage, 'url'), + 'timestamp': timestamp, + } + + +class EllenTVClipsIE(InfoExtractor): + IE_NAME = 'EllenTV:clips' + _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' + _TEST = { + 'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', + 'info_dict': { + 'id': 'meryl-streep-vanessa-hudgens', + 'title': 'Meryl Streep, Vanessa Hudgens', + }, + 'playlist_mincount': 9, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + playlist = self._extract_playlist(webpage) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._og_search_title(webpage), + 'entries': self._extract_entries(playlist) + } + + def _extract_playlist(self, webpage): + json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') + try: + return json.loads("[{" + json_string + "}]") + except ValueError as ve: + raise ExtractorError('Failed to download JSON', cause=ve) + + def _extract_entries(self, playlist): + return [self.url_result(item['url'], 'EllenTV') for item in playlist] diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 272dfe1f6..476fc22b9 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,7 +36,7 @@ class EscapistIE(InfoExtractor): r'<meta name="description" content="([^"]*)"', webpage, 'description', fatal=False) - playerUrl = self._og_search_video_url(webpage, name=u'player URL') + playerUrl = self._og_search_video_url(webpage, name='player URL') title = self._html_search_regex( r'<meta name="title" content="([^"]*)"', diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f0cd8f156..f7cf700b5 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,7 +20,7 @@ from ..utils import ( class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:\w+\.)?facebook\.com/ - (?:[^#?]*\#!/)? + (?:[^#]*?\#!/)? (?:video/video\.php|photo\.php|video/embed)\?(?:.*?) (?:v|video_id)=(?P<id>[0-9]+) (?:.*)''' diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py index 6d73c8a4a..af439ccfe 100644 --- a/youtube_dl/extractor/firedrive.py +++ b/youtube_dl/extractor/firedrive.py @@ -42,7 +42,6 @@ class FiredriveIE(InfoExtractor): fields = dict(re.findall(r'''(?x)<input\s+ type="hidden"\s+ name="([^"]+)"\s+ - (?:id="[^"]+"\s+)? value="([^"]*)" ''', webpage)) @@ -66,7 +65,7 @@ class FiredriveIE(InfoExtractor): ext = self._search_regex(r'type:\s?\'([^\']+)\',', webpage, 'extension', fatal=False) video_url = self._search_regex( - r'file:\s?\'(http[^\']+)\',', webpage, 'file url') + r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') formats = [{ 'format_id': 'sd', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bcb076594..8e915735e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -706,6 +706,13 @@ class GenericIE(InfoExtractor): url = unescapeHTML(mobj.group('url')) return self.url_result(url, ie='MTVServicesEmbedded') + # Look for embedded yahoo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Yahoo') + # Start with something easy: JW Player in SWFObject found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if not found: diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py new file mode 100644 index 000000000..68684b997 --- /dev/null +++ b/youtube_dl/extractor/howstuffworks.py @@ -0,0 +1,134 @@ +from __future__ import unicode_literals + +import re +import json +import random +import string + +from .common import InfoExtractor +from ..utils import find_xpath_attr + + +class HowStuffWorksIE(InfoExtractor): + _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm' + _TESTS = [ + { + 'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', + 'info_dict': { + 'id': '450221', + 'display_id': 'cool-jobs-iditarod-musher', + 'ext': 'flv', + 'title': 'Cool Jobs - Iditarod Musher', + 'description': 'md5:82bb58438a88027b8186a1fccb365f90', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # md5 is not consistent + 'skip_download': True + } + }, + { + 'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm', + 'info_dict': { + 'id': '553470', + 'display_id': 'deadliest-catch-jakes-farewell-pots', + 'ext': 'mp4', + 'title': 'Deadliest Catch: Jake\'s Farewell Pots', + 'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # md5 is not consistent + 'skip_download': True + } + }, + { + 'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', + 'info_dict': { + 'id': '440011', + 'display_id': 'sword-swallowing-1-by-dan-meyer', + 'ext': 'flv', + 'title': 'Sword Swallowing #1 by Dan Meyer', + 'description': 'md5:b2409e88172913e2e7d3d1159b0ef735', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # md5 is not consistent + 'skip_download': True + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + webpage = self._download_webpage(url, display_id) + + content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id') + + mp4 = self._search_regex( + r'''(?xs)var\s+clip\s*=\s*{\s* + .+?\s* + content_id\s*:\s*%s\s*,\s* + .+?\s* + mp4\s*:\s*\[(.*?),?\]\s* + };\s* + videoData\.push\(clip\);''' % content_id, + webpage, 'mp4', fatal=False, default=None) + + smil = self._download_xml( + 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id, + content_id, 'Downloading video SMIL') + + http_base = find_xpath_attr( + smil, + './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'), + 'name', + 'httpBase').get('content') + + def random_string(str_len=0): + return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)]) + + URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12)) + + formats = [] + + if mp4: + for video in json.loads('[%s]' % mp4): + bitrate = video['bitrate'] + fmt = { + 'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX, + 'format_id': bitrate, + } + m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate) + if m: + fmt['vbr'] = int(m.group('vbr')) + formats.append(fmt) + else: + for video in smil.findall( + './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')): + vbr = int(video.attrib['system-bitrate']) / 1000 + formats.append({ + 'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX), + 'format_id': '%dk' % vbr, + 'vbr': vbr, + }) + + self._sort_formats(formats) + + title = self._og_search_title(webpage) + TITLE_SUFFIX = ' : HowStuffWorks' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': content_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py new file mode 100644 index 000000000..cf73cd753 --- /dev/null +++ b/youtube_dl/extractor/jove.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unified_strdate +) + + +class JoveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)' + _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}' + _TESTS = [ + { + 'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current', + 'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b', + 'info_dict': { + 'id': '2744', + 'ext': 'mp4', + 'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', + 'description': 'md5:015dd4509649c0908bc27f049e0262c6', + 'thumbnail': 're:^https?://.*\.png$', + 'upload_date': '20110523', + } + }, + { + 'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', + 'md5': '914aeb356f416811d911996434811beb', + 'info_dict': { + 'id': '51796', + 'ext': 'mp4', + 'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', + 'description': 'md5:35ff029261900583970c4023b70f1dc9', + 'thumbnail': 're:^https?://.*\.png$', + 'upload_date': '20140802', + } + }, + + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + chapters_id = self._html_search_regex( + r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id') + + chapters_xml = self._download_xml( + self._CHAPTERS_URL.format(video_id=chapters_id), + video_id, note='Downloading chapters XML', + errnote='Failed to download chapters XML') + + video_url = chapters_xml.attrib.get('video') + if not video_url: + raise ExtractorError('Failed to get the video URL') + + title = self._html_search_meta('citation_title', webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>', + webpage, 'description', fatal=False) + publish_date = unified_strdate(self._html_search_meta( + 'citation_publication_date', webpage, 'publish date', fatal=False)) + comment_count = self._html_search_regex( + r'<meta name="num_comments" content="(\d+) Comments?"', + webpage, 'comment count', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'description': description, + 'upload_date': publish_date, + 'comment_count': comment_count, + } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c0c2d9b09..281a0ce40 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -111,17 +111,28 @@ class LivestreamIE(InfoExtractor): event_name = mobj.group('event_name') webpage = self._download_webpage(url, video_id or event_name) - if video_id is None: - # This is an event page: + og_video = self._og_search_video_url(webpage, 'player url', fatal=False, default=None) + if og_video is None: config_json = self._search_regex( r'window.config = ({.*?});', webpage, 'window config') info = json.loads(config_json)['event'] + + def is_relevant(vdata, vid): + result = vdata['type'] == 'video' + if video_id is not None: + result = result and compat_str(vdata['data']['id']) == vid + return result + videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] - if video_data['type'] == 'video'] - return self.playlist_result(videos, info['id'], info['full_name']) + for video_data in info['feed']['data'] + if is_relevant(video_data, video_id)] + if video_id is None: + # This is an event page: + return self.playlist_result(videos, info['id'], info['full_name']) + else: + if videos: + return videos[0] else: - og_video = self._og_search_video_url(webpage, 'player url') query_str = compat_urllib_parse_urlparse(og_video).query query = compat_urlparse.parse_qs(query_str) api_url = query['play_url'][0].replace('.smil', '') diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 6436c05a3..1a896b536 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -9,6 +9,7 @@ from ..utils import ( compat_urllib_request, determine_ext, ExtractorError, + int_or_none, ) @@ -83,6 +84,21 @@ class MetacafeIE(InfoExtractor): 'skip_download': True, }, }, + # Movieclips.com video + { + 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', + 'info_dict': { + 'id': 'mv-Wy7ZU', + 'ext': 'mp4', + 'title': 'My Week with Marilyn - Do You Love Me?', + 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', + 'uploader': 'movie_trailers', + 'duration': 176, + }, + 'params': { + 'skip_download': 'requires rtmpdump', + } + } ] def report_disclaimer(self): @@ -134,6 +150,7 @@ class MetacafeIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) + video_url = None mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: mediaURL = compat_urllib_parse.unquote(mobj.group(1)) @@ -146,16 +163,17 @@ class MetacafeIE(InfoExtractor): else: gdaKey = mobj.group(1) video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: + if video_url is None: mobj = re.search(r'<video src="([^"]+)"', webpage) if mobj: video_url = mobj.group(1) video_ext = 'mp4' - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) + if video_url is None: + flashvars = self._search_regex( + r' name="flashvars" value="(.*?)"', webpage, 'flashvars', + default=None) + if flashvars: + vardict = compat_parse_qs(flashvars) if 'mediaData' not in vardict: raise ExtractorError('Unable to extract media URL') mobj = re.search( @@ -165,26 +183,68 @@ class MetacafeIE(InfoExtractor): mediaURL = mobj.group('mediaURL').replace('\\/', '/') video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) video_ext = determine_ext(video_url) - - video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title') + if video_url is None: + player_url = self._search_regex( + r"swfobject\.embedSWF\('([^']+)'", + webpage, 'config URL', default=None) + if player_url: + config_url = self._search_regex( + r'config=(.+)$', player_url, 'config URL') + config_doc = self._download_xml( + config_url, video_id, + note='Downloading video config') + smil_url = config_doc.find('.//properties').attrib['smil_file'] + smil_doc = self._download_xml( + smil_url, video_id, + note='Downloading SMIL document') + base_url = smil_doc.find('./head/meta').attrib['base'] + video_url = [] + for vn in smil_doc.findall('.//video'): + br = int(vn.attrib['system-bitrate']) + play_path = vn.attrib['src'] + video_url.append({ + 'format_id': 'smil-%d' % br, + 'url': base_url, + 'play_path': play_path, + 'page_url': url, + 'player_url': player_url, + 'ext': play_path.partition(':')[0], + }) + + if video_url is None: + raise ExtractorError('Unsupported video type') + + video_title = self._html_search_regex( + r'(?im)<title>(.*) - Video</title>', webpage, 'title') description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) + duration = int_or_none( + self._html_search_meta('video:duration', webpage)) + + age_limit = ( + 18 + if re.search(r'"contentRating":"restricted"', webpage) + else 0) - if re.search(r'"contentRating":"restricted"', webpage) is not None: - age_limit = 18 + if isinstance(video_url, list): + formats = video_url else: - age_limit = 0 + formats = [{ + 'url': video_url, + 'ext': video_ext, + }] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'description': description, 'uploader': video_uploader, 'title': video_title, - 'thumbnail':thumbnail, - 'ext': video_ext, + 'thumbnail': thumbnail, 'age_limit': age_limit, + 'formats': formats, + 'duration': duration, } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py new file mode 100644 index 000000000..979f3d692 --- /dev/null +++ b/youtube_dl/extractor/mitele.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + get_element_by_attribute, + parse_duration, + strip_jsonp, +) + + +class MiTeleIE(InfoExtractor): + IE_NAME = 'mitele.es' + _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/' + + _TEST = { + 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', + 'md5': '6a75fe9d0d3275bead0cb683c616fddb', + 'info_dict': { + 'id': '0fce117d', + 'ext': 'mp4', + 'title': 'Programa 144 - Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'display_id': 'programa-144', + 'duration': 2913, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = mobj.group('episode') + webpage = self._download_webpage(url, episode) + embed_data_json = self._search_regex( + r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', + flags=re.DOTALL + ).replace('\'', '"') + embed_data = json.loads(embed_data_json) + + info_url = embed_data['flashvars']['host'] + info_el = self._download_xml(info_url, episode).find('./video/info') + + video_link = info_el.find('videoUrl/link').text + token_query = compat_urllib_parse.urlencode({'id': video_link}) + token_info = self._download_json( + 'http://token.mitele.es/?' + token_query, episode, + transform_source=strip_jsonp + ) + + return { + 'id': embed_data['videoId'], + 'display_id': episode, + 'title': info_el.find('title').text, + 'url': token_info['tokenizedUrl'], + 'description': get_element_by_attribute('class', 'text', webpage), + 'thumbnail': info_el.find('thumb').text, + 'duration': parse_duration(info_el.find('duration').text), + } diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py new file mode 100644 index 000000000..90b460d65 --- /dev/null +++ b/youtube_dl/extractor/mojvideo.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_duration, +) + + +class MojvideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mojvideo\.com/video-(?P<display_id>[^/]+)/(?P<id>[a-f0-9]+)' + _TEST = { + 'url': 'http://www.mojvideo.com/video-v-avtu-pred-mano-rdecelaska-alfi-nipic/3d1ed4497707730b2906', + 'md5': 'f7fd662cc8ce2be107b0d4f2c0483ae7', + 'info_dict': { + 'id': '3d1ed4497707730b2906', + 'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic', + 'ext': 'mp4', + 'title': 'V avtu pred mano rdečelaska - Alfi Nipič', + 'thumbnail': 're:^http://.*\.jpg$', + 'duration': 242, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + # XML is malformed + playerapi = self._download_webpage( + 'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id) + + if '<error>true</error>' in playerapi: + error_desc = self._html_search_regex( + r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) + + title = self._html_search_regex( + r'<title>([^<]+)</title>', playerapi, 'title') + video_url = self._html_search_regex( + r'<file>([^<]+)</file>', playerapi, 'video URL') + thumbnail = self._html_search_regex( + r'<preview>([^<]+)</preview>', playerapi, 'thumbnail', fatal=False) + duration = parse_duration(self._html_search_regex( + r'<duration>([^<]+)</duration>', playerapi, 'duration', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 1c5e9401f..6b2f3f55a 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,3 +1,4 @@ +# encoding: utf-8 from __future__ import unicode_literals import re @@ -8,19 +9,34 @@ from ..utils import ExtractorError class NownessIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' - - _TEST = { - 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', - 'md5': '068bc0202558c2e391924cb8cc470676', - 'info_dict': { - 'id': '2520295746001', - 'ext': 'mp4', - 'description': 'Candor: The Art of Gesticulation', - 'uploader': 'Nowness', - 'title': 'Candor: The Art of Gesticulation', - } - } + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' + + _TESTS = [ + { + 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', + 'md5': '068bc0202558c2e391924cb8cc470676', + 'info_dict': { + 'id': '2520295746001', + 'ext': 'mp4', + 'title': 'Candor: The Art of Gesticulation', + 'description': 'Candor: The Art of Gesticulation', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Nowness', + } + }, + { + 'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr', + 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', + 'info_dict': { + 'id': '3716354522001', + 'ext': 'mp4', + 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', + 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Nowness', + } + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py deleted file mode 100644 index 38971ab4d..000000000 --- a/youtube_dl/extractor/oe1.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import calendar -import datetime -import re - -from .common import InfoExtractor - -# audios on oe1.orf.at are only available for 7 days, so we can't -# add tests. - - -class OE1IE(InfoExtractor): - IE_DESC = 'oe1.orf.at' - _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_id = mobj.group('id') - - data = self._download_json( - 'http://oe1.orf.at/programm/%s/konsole' % show_id, - show_id - ) - - timestamp = datetime.datetime.strptime('%s %s' % ( - data['item']['day_label'], - data['item']['time'] - ), '%d.%m.%Y %H:%M') - unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - - return { - 'id': show_id, - 'title': data['item']['title'], - 'url': data['item']['url_stream'], - 'ext': 'mp3', - 'description': data['item'].get('info'), - 'timestamp': unix_timestamp - } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 13f12824c..2044e107e 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -3,23 +3,38 @@ import re import json from .common import InfoExtractor -from ..utils import unescapeHTML +from ..utils import ( + unescapeHTML, + ExtractorError, +) class OoyalaIE(InfoExtractor): _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' - _TEST = { - # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video - 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c', - 'info_dict': { - 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'ext': 'mp4', - 'title': 'Explaining Data Recovery from Hard Drives and SSDs', - 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + _TESTS = [ + { + # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video + 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c', + 'info_dict': { + 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'ext': 'mp4', + 'title': 'Explaining Data Recovery from Hard Drives and SSDs', + 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + }, + }, { + # Only available for ipad + 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', + 'md5': '4b9754921fddb68106e48c142e2a01e6', + 'info_dict': { + 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', + 'ext': 'mp4', + 'title': 'Simulation Overview - Levels of Simulation', + 'description': '', + }, }, - } + ] @staticmethod def _url_for_embed_code(embed_code): @@ -47,13 +62,30 @@ class OoyalaIE(InfoExtractor): player = self._download_webpage(player_url, embedCode) mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', player, 'mobile player url') - mobile_player = self._download_webpage(mobile_url, embedCode) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info').replace('\\"','"') - videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"') + # Looks like some videos are only available for particular devices + # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 + # is only available for ipad) + # Working around with fetching URLs for all the devices found starting with 'unknown' + # until we succeed or eventually fail for each device. + devices = re.findall(r'device\s*=\s*"([^"]+)";', player) + devices.remove('unknown') + devices.insert(0, 'unknown') + for device in devices: + mobile_player = self._download_webpage( + '%s&device=%s' % (mobile_url, device), embedCode, + 'Downloading mobile player JS for %s device' % device) + videos_info = self._search_regex( + r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', + mobile_player, 'info', fatal=False, default=None) + if videos_info: + break + if not videos_info: + raise ExtractorError('Unable to extract info') + videos_info = videos_info.replace('\\"', '"') + videos_more_info = self._search_regex( + r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') videos_info = json.loads(videos_info) - videos_more_info =json.loads(videos_more_info) + videos_more_info = json.loads(videos_more_info) if videos_more_info.get('lineup'): videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 03421d1d5..011e6be13 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import json import re +import calendar +import datetime from .common import InfoExtractor from ..utils import ( @@ -12,7 +14,9 @@ from ..utils import ( ) -class ORFIE(InfoExtractor): +class ORFTVthekIE(InfoExtractor): + IE_NAME = 'orf:tvthek' + IE_DESC = 'ORF TVthek' _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)' _TEST = { @@ -105,3 +109,73 @@ class ORFIE(InfoExtractor): 'entries': entries, 'id': playlist_id, } + + +# Audios on ORF radio are only available for 7 days, so we can't add tests. + + +class ORFOE1IE(InfoExtractor): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('id') + + data = self._download_json( + 'http://oe1.orf.at/programm/%s/konsole' % show_id, + show_id + ) + + timestamp = datetime.datetime.strptime('%s %s' % ( + data['item']['day_label'], + data['item']['time'] + ), '%d.%m.%Y %H:%M') + unix_timestamp = calendar.timegm(timestamp.utctimetuple()) + + return { + 'id': show_id, + 'title': data['item']['title'], + 'url': data['item']['url_stream'], + 'ext': 'mp3', + 'description': data['item'].get('info'), + 'timestamp': unix_timestamp + } + + +class ORFFM4IE(InfoExtractor): + IE_DESC = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + show_date = mobj.group('date') + show_id = mobj.group('show') + + data = self._download_json( + 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), + show_id + ) + + def extract_entry_dict(info, title, subtitle): + return { + 'id': info['loopStreamId'].replace('.mp3', ''), + 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], + 'title': title, + 'description': subtitle, + 'duration': (info['end'] - info['start']) / 1000, + 'timestamp': info['start'] / 1000, + 'ext': 'mp3' + } + + entries = [extract_entry_dict(t, data['title'], data['subtitle']) for t in data['streams']] + + return { + '_type': 'playlist', + 'id': show_id, + 'title': data['title'], + 'description': data['subtitle'], + 'entries': entries + }
\ No newline at end of file diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ec95d0704..dee4af6f1 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -20,17 +20,41 @@ class PBSIE(InfoExtractor): ) ''' - _TEST = { - 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', - 'info_dict': { - 'id': '2365006249', - 'ext': 'mp4', - 'title': 'A More Perfect Union', - 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', - 'duration': 3190, + _TESTS = [ + { + 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', + 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'info_dict': { + 'id': '2365006249', + 'ext': 'mp4', + 'title': 'A More Perfect Union', + 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', + 'duration': 3190, + }, + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', + 'md5': '143c98aa54a346738a3d78f54c925321', + 'info_dict': { + 'id': '2365297690', + 'ext': 'mp4', + 'title': 'Losing Iraq', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 5050, + }, }, - } + { + 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', + 'md5': 'b19856d7f5351b17a5ab1dc6a64be633', + 'info_dict': { + 'id': '2201174722', + 'ext': 'mp4', + 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', + 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', + 'duration': 801, + }, + }, + ] def _extract_ids(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,10 +64,13 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) - # frontline video embed + MEDIA_ID_REGEXES = [ + r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed + r'class="coveplayerid">([^<]+)<', # coveplayer + ] + media_id = self._search_regex( - r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", - webpage, 'frontline video ID', fatal=False, default=None) + MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: return media_id, presumptive_id diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py index 49cf427a1..ec7e7df7b 100644 --- a/youtube_dl/extractor/reverbnation.py +++ b/youtube_dl/extractor/reverbnation.py @@ -1,23 +1,23 @@ from __future__ import unicode_literals import re -import time from .common import InfoExtractor -from ..utils import strip_jsonp +from ..utils import str_or_none class ReverbNationIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', - 'file': '16965047.mp3', 'md5': '3da12ebca28c67c111a7f8b262d3f7a7', 'info_dict': { + "id": "16965047", + "ext": "mp3", "title": "MONA LISA", "uploader": "ALKILADOS", - "uploader_id": 216429, - "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg" + "uploader_id": "216429", + "thumbnail": "re:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$" }, }] @@ -26,10 +26,8 @@ class ReverbNationIE(InfoExtractor): song_id = mobj.group('id') api_res = self._download_json( - 'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d' - % (song_id, int(time.time() * 1000)), + 'https://api.reverbnation.com/song/%s' % song_id, song_id, - transform_source=strip_jsonp, note='Downloading information of song %s' % song_id ) @@ -38,8 +36,9 @@ class ReverbNationIE(InfoExtractor): 'title': api_res.get('name'), 'url': api_res.get('url'), 'uploader': api_res.get('artist', {}).get('name'), - 'uploader_id': api_res.get('artist', {}).get('id'), - 'thumbnail': api_res.get('image', api_res.get('thumbnail')), + 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), + 'thumbnail': self._proto_relative_url( + api_res.get('image', api_res.get('thumbnail'))), 'ext': 'mp3', 'vcodec': 'none', } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 000000000..14928cd62 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RtlXlIE(InfoExtractor): + IE_NAME = 'rtlxl.nl' + _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + + _TEST = { + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', + 'info_dict': { + 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'ext': 'flv', + 'title': 'RTL Nieuws - Laat', + 'description': 'Dagelijks het laatste nieuws uit binnen- en ' + 'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van ' + 'onze mobiele apps.', + 'timestamp': 1408051800, + 'upload_date': '20140814', + }, + 'params': { + # We download the first bytes of the first fragment, it can't be + # processed by the f4m downloader beacuse it isn't complete + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uuid = mobj.group('uuid') + + info = self._download_json( + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + uuid) + meta = info['meta'] + material = info['material'][0] + episode_info = info['episodes'][0] + + f4m_url = 'http://manifest.us.rtl.nl' + material['videopath'] + progname = info['abstracts'][0]['name'] + subtitle = material['title'] or info['episodes'][0]['name'] + + return { + 'id': uuid, + 'title': '%s - %s' % (progname, subtitle), + 'formats': self._extract_f4m_formats(f4m_url, uuid), + 'timestamp': material['original_date'], + 'description': episode_info['synopsis'], + } diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8607482be..badba2ac6 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -17,11 +17,11 @@ class SharedIE(InfoExtractor): _TEST = { 'url': 'http://shared.sx/0060718775', - 'md5': '53e1c58fc3e777ae1dfe9e57ba2f9c72', + 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { 'id': '0060718775', 'ext': 'mp4', - 'title': 'Big Buck Bunny Trailer', + 'title': 'Bmp4', }, } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f8dd7e955..fa796ce72 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor): video_id = mobj.group("video_id") if not video_id: video_id = self._html_search_regex( - r'<article class="video" data-id="(\d+?)"', + r'data-node-id="(\d+?)"', webpage, 'video id') data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 10844f39e..11c7d7e81 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -276,7 +276,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): if video_thumbnail is None: video_thumbs = config["video"].get("thumbs") if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1] + _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description video_description = None diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index f1b9e9a19..2544c24bd 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + compat_str, +) class VubeIE(InfoExtractor): @@ -29,6 +31,7 @@ class VubeIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'comment_count': int, + 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], } }, { @@ -47,6 +50,7 @@ class VubeIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'comment_count': int, + 'categories': ['seraina', 'jessica', 'krewella', 'alive'], } }, { 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', @@ -56,13 +60,15 @@ class VubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Frozen - Let It Go Cover by Siren Gene', 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'uploader': 'Siren Gene', - 'uploader_id': 'Siren', 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', + 'uploader': 'Siren', + 'timestamp': 1395448018, + 'upload_date': '20140322', 'duration': 221.788, 'like_count': int, 'dislike_count': int, 'comment_count': int, + 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], } } ] @@ -71,47 +77,40 @@ class VubeIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - data_json = self._search_regex( - r'(?s)window\["(?:tapiVideoData|vubeOriginalVideoData)"\]\s*=\s*(\{.*?\n});\n', - webpage, 'video data' - ) - data = json.loads(data_json) - video = ( - data.get('video') or - data) - assert isinstance(video, dict) + video = self._download_json( + 'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON') public_id = video['public_id'] - formats = [ - { - 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id), - 'height': int(fmt['height']), - 'abr': int(fmt['audio_bitrate']), - 'vbr': int(fmt['video_bitrate']), - 'format_id': fmt['media_resolution_id'] - } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed' - ] + formats = [] + + for media in video['media'].get('video', []) + video['media'].get('audio', []): + if media['transcoding_status'] != 'processed': + continue + fmt = { + 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), + 'abr': int(media['audio_bitrate']), + 'format_id': compat_str(media['media_resolution_id']), + } + vbr = int(media['video_bitrate']) + if vbr: + fmt.update({ + 'vbr': vbr, + 'height': int(media['height']), + }) + formats.append(fmt) self._sort_formats(formats) title = video['title'] description = video.get('description') - thumbnail = self._proto_relative_url( - video.get('thumbnail') or video.get('thumbnail_src'), - scheme='http:') - uploader = data.get('user', {}).get('channel', {}).get('name') or video.get('user_alias') - uploader_id = data.get('user', {}).get('name') + thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') + uploader = video.get('user_alias') or video.get('channel') timestamp = int_or_none(video.get('upload_time')) duration = video['duration'] view_count = video.get('raw_view_count') - like_count = video.get('rlikes') - if like_count is None: - like_count = video.get('total_likes') - dislike_count = video.get('rhates') - if dislike_count is None: - dislike_count = video.get('total_hates') + like_count = video.get('total_likes') + dislike_count = video.get('total_hates') comments = video.get('comments') comment_count = None @@ -124,6 +123,8 @@ class VubeIE(InfoExtractor): else: comment_count = len(comments) + categories = [tag['text'] for tag in video['tags']] + return { 'id': video_id, 'formats': formats, @@ -131,11 +132,11 @@ class VubeIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, - 'uploader_id': uploader_id, 'timestamp': timestamp, 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, + 'categories': categories, } diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py new file mode 100644 index 000000000..a9aa72e73 --- /dev/null +++ b/youtube_dl/extractor/xboxclips.py @@ -0,0 +1,57 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + float_or_none, + int_or_none, +) + + +class XboxClipsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/video\.php\?.*vid=(?P<id>[\w-]{36})' + _TEST = { + 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', + 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', + 'info_dict': { + 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', + 'ext': 'mp4', + 'title': 'Iabdulelah playing Upload Studio', + 'filesize_approx': 28101836.8, + 'timestamp': 1407388500, + 'upload_date': '20140807', + 'duration': 56, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'>Link: <a href="([^"]+)">', webpage, 'video URL') + title = self._html_search_regex( + r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title') + timestamp = parse_iso8601(self._html_search_regex( + r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + filesize = float_or_none(self._html_search_regex( + r'>Size: ([\d\.]+)MB<', webpage, 'file size', fatal=False), invscale=1024 * 1024) + duration = int_or_none(self._html_search_regex( + r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._html_search_regex( + r'>Views: (\d+)<', webpage, 'view count', fatal=False)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'timestamp': timestamp, + 'filesize_approx': filesize, + 'duration': duration, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d84be2562..0e3b33b16 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,7 +15,7 @@ from ..utils import ( class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html' + _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -46,12 +46,23 @@ class YahooIE(InfoExtractor): 'title': 'The World Loves Spider-Man', 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', } - } + }, + { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '60e8ac193d8fb71997caa8fce54c6460', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + } + }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + url = mobj.group('url') webpage = self._download_webpage(url, video_id) items_json = self._search_regex( diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3c9b0b584..2c44f36a5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -225,7 +225,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # RTMP (unnamed) @@ -374,6 +374,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: u''.join(s[i] for i in cache_spec) except IOError: pass # No cache available + except ValueError: + try: + file_size = os.path.getsize(cache_fn) + except (OSError, IOError) as oe: + file_size = str(oe) + self._downloader.report_warning( + u'Cache %s failed (%s)' % (cache_fn, file_size)) if player_type == 'js': code = self._download_webpage( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e40b367c2..f8ec5389f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -24,6 +24,7 @@ import socket import struct import subprocess import sys +import tempfile import traceback import xml.etree.ElementTree import zlib @@ -228,18 +229,42 @@ else: assert type(s) == type(u'') print(s) -# In Python 2.x, json.dump expects a bytestream. -# In Python 3.x, it writes to a character stream -if sys.version_info < (3,0): - def write_json_file(obj, fn): - with open(fn, 'wb') as f: - json.dump(obj, f) -else: - def write_json_file(obj, fn): - with open(fn, 'w', encoding='utf-8') as f: - json.dump(obj, f) -if sys.version_info >= (2,7): +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically """ + + args = { + 'suffix': '.tmp', + 'prefix': os.path.basename(fn) + '.', + 'dir': os.path.dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**args) + + try: + with tf: + json.dump(obj, tf) + os.rename(tf.name, fn) + except: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) @@ -827,6 +852,7 @@ def unified_strdate(date_str): '%b %dnd %Y %I:%M%p', '%b %dth %Y %I:%M%p', '%Y-%m-%d', + '%Y/%m/%d', '%d.%m.%Y', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', @@ -1273,9 +1299,15 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) + if v == '': + v = None return default if v is None else (int(v) * invscale // scale) +def str_or_none(v, default=None): + return default if v is None else compat_str(v) + + def str_to_int(int_str): if int_str is None: return None diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 401fa3d10..15b9d6c61 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.08.02.1' +__version__ = '2014.08.21.3' |