diff options
Diffstat (limited to 'youtube_dl/extractor')
25 files changed, 601 insertions, 201 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0dd763006..8e31de93d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,12 +20,14 @@ from .arte import ( ArteTVDDCIE, ArteTVEmbedIE, ) +from .audiomack import AudiomackIE from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE @@ -137,6 +139,7 @@ from .gamestar import GameStarIE from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE from .golem import GolemIE @@ -370,6 +373,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .telecinco import TelecincoIE from .telemb import TeleMBIE from .tenplay import TenPlayIE from .testurl import TestURLIE @@ -424,6 +428,7 @@ from .videopremium import VideoPremiumIE from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vidme import VidmeIE +from .vidzi import VidziIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -443,6 +448,7 @@ from .viki import VikiIE from .vk import VKIE from .vodlocker import VodlockerIE from .vporn import VpornIE +from .vrt import VRTIE from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE @@ -492,10 +498,8 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) - from .zdf import ZDFIE - _ALL_CLASSES = [ klass for name, klass in globals().items() diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 3a34d1ecc..b9a9440c0 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,8 +10,8 @@ from ..utils import ( unified_strdate, determine_ext, get_element_by_id, - compat_str, get_element_by_attribute, + int_or_none, ) # There are different sources of video in arte.tv, the extraction process @@ -90,15 +90,24 @@ class ArteTVPlus7IE(InfoExtractor): if not upload_date_str: upload_date_str = player_info.get('VDA', '').split(' ')[0] + title = player_info['VTI'].strip() + subtitle = player_info.get('VSU', '').strip() + if subtitle: + title += ' - %s' % subtitle + info_dict = { 'id': player_info['VID'], - 'title': player_info['VTI'], + 'title': title, 'description': player_info.get('VDE'), 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } - all_formats = player_info['VSR'].values() + all_formats = [] + for format_id, format_dict in player_info['VSR'].items(): + fmt = dict(format_dict) + fmt['format_id'] = format_id + all_formats.append(fmt) # Some formats use the m3u8 protocol all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) def _match_lang(f): @@ -149,25 +158,12 @@ class ArteTVPlus7IE(InfoExtractor): ) formats = sorted(formats, key=sort_key) def _format(format_info): - quality = '' - height = format_info.get('height') - if height is not None: - quality = compat_str(height) - bitrate = format_info.get('bitrate') - if bitrate is not None: - quality += '-%d' % bitrate - if format_info.get('versionCode') is not None: - format_id = '%s-%s' % (quality, format_info['versionCode']) - else: - format_id = quality - media_type = format_info.get('mediaType') - if media_type is not None: - format_id += '-%s' % media_type info = { - 'format_id': format_id, - 'format_note': format_info.get('versionLibelle'), - 'width': format_info.get('width'), - 'height': height, + 'format_id': format_info['format_id'], + 'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')), + 'width': int_or_none(format_info.get('width')), + 'height': int_or_none(format_info.get('height')), + 'tbr': int_or_none(format_info.get('bitrate')), } if format_info['mediaType'] == 'rtmp': info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py new file mode 100644 index 000000000..57446fddd --- /dev/null +++ b/youtube_dl/extractor/audiomack.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .soundcloud import SoundcloudIE +from ..utils import ExtractorError +import datetime +import time + + +class AudiomackIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + IE_NAME = 'audiomack' + _TESTS = [ + #hosted on audiomack + { + 'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary', + 'info_dict': + { + 'id' : 'roosh-williams/extraordinary', + 'ext': 'mp3', + 'title': 'Roosh Williams - Extraordinary' + } + }, + #hosted on soundcloud via audiomack + { + 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', + 'file': '172419696.mp3', + 'info_dict': + { + 'ext': 'mp3', + 'title': 'Young Thug ft Lil Wayne - Take Kare', + "upload_date": "20141016", + "description": "New track produced by London On Da Track called “Take Kare\"\n\nhttp://instagram.com/theyoungthugworld\nhttps://www.facebook.com/ThuggerThuggerCashMoney\n", + "uploader": "Young Thug World" + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_response = self._download_json( + "http://www.audiomack.com/api/music/url/song/%s?_=%d" % ( + video_id, time.time()), + video_id) + + if "url" not in api_response: + raise ExtractorError("Unable to deduce api url of song") + realurl = api_response["url"] + + #Audiomack wraps a lot of soundcloud tracks in their branded wrapper + # - if so, pass the work off to the soundcloud extractor + if SoundcloudIE.suitable(realurl): + return {'_type': 'url', 'url': realurl, 'ie_key': 'Soundcloud'} + + webpage = self._download_webpage(url, video_id) + artist = self._html_search_regex( + r'<span class="artist">(.*?)</span>', webpage, "artist") + songtitle = self._html_search_regex( + r'<h1 class="profile-title song-title"><span class="artist">.*?</span>(.*?)</h1>', + webpage, "title") + title = artist + " - " + songtitle + + return { + 'id': video_id, + 'title': title, + 'url': realurl, + } diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py new file mode 100644 index 000000000..0269d1174 --- /dev/null +++ b/youtube_dl/extractor/bild.py @@ -0,0 +1,39 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class BildIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' + IE_DESC = 'Bild.de' + _TEST = { + 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', + 'md5': 'dd495cbd99f2413502a1713a1156ac8a', + 'info_dict': { + 'id': '38184146', + 'ext': 'mp4', + 'title': 'BILD hat sie getestet', + 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', + 'duration': 196, + 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" + doc = self._download_xml(xml_url, video_id) + + duration = int_or_none(doc.attrib.get('duration'), scale=1000) + + return { + 'id': video_id, + 'title': doc.attrib['ueberschrift'], + 'description': doc.attrib.get('text'), + 'url': doc.attrib['src'], + 'thumbnail': doc.attrib.get('img'), + 'duration': duration, + } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 496271be4..d064a28f9 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: raise ExtractorError('Can\'t extract embed url and video id') playerdata_url = mobj.group('embed_url') @@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor): video_description = self._html_search_regex( r'<div class="entry-content">(?P<description>.+?)</div>', webpage, 'description', flags=re.DOTALL, fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') - video_thumbnail = self._search_regex( - r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) - sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') - videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') + vidurl = self._search_regex( + r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') + vidid = self._search_regex( + r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') + videoserver = self._html_search_regex( + r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') + + videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') formats = [] - baseurl = sd_url[:sd_url.rfind('/')+1] + baseurl = vidurl[:vidurl.rfind('/')+1] for video in videolist.findall('.//video'): src = video.get('src') if not src: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e8366f7f9..e1bd6bb49 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -89,6 +89,10 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. * http_referer HTTP Referer header value to set. * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers @@ -238,7 +242,6 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@ -247,6 +250,10 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -305,7 +312,7 @@ class InfoExtractor(object): msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) - return (content, urlh) + return content def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the data of the page as a string """ @@ -613,12 +620,13 @@ class InfoExtractor(object): audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, + f.get('source_preference') if f.get('source_preference') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) def http_scheme(self): - """ Either "https:" or "https:", depending on the user's preferences """ + """ Either "http:" or "https:", depending on the user's preferences """ return ( 'http:' if self._downloader.params.get('prefer_insecure', False) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9ac86c2be..2dca52660 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -40,6 +40,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor): 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', + 'url': 're:(?!.*&)', }, 'params': { # rtmp @@ -238,12 +239,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) - streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) - video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') - video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path') + streamdata = self._download_xml( + streamdata_req, video_id, + note='Downloading media info for %s' % video_format) + video_url = streamdata.find('.//host').text + video_play_path = streamdata.find('.//file').text formats.append({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_play_path, 'ext': 'flv', 'format': video_format, 'format_id': video_format, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 0b3374d97..566e20d76 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -46,7 +46,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): f4m_format['preference'] = 1 formats.extend(f4m_formats) elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): formats.append({ 'url': video_url, 'format_id': format_id, - 'preference': 2, + 'preference': -1, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index d966e8403..ec6d96ada 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -37,7 +37,7 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage) + links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage) if not links: raise ExtractorError('No media links available for %s' % video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9057a6beb..51dbbc8db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -380,6 +380,17 @@ class GenericIE(InfoExtractor): 'uploader': 'education-portal.com', }, }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -476,7 +487,8 @@ class GenericIE(InfoExtractor): 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) @@ -491,14 +503,14 @@ class GenericIE(InfoExtractor): self.to_screen('%s: Requesting header' % video_id) head_req = HEADRequest(url) - response = self._request_webpage( + head_response = self._request_webpage( head_req, video_id, note=False, errnote='Could not send HEAD request to %s' % url, fatal=False) - if response is not False: + if head_response is not False: # Check for redirect - new_url = response.geturl() + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -506,34 +518,35 @@ class GenericIE(InfoExtractor): new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) - # Check for direct link to a video - content_type = response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) - if m: - upload_date = response.headers.get('Last-Modified') - if upload_date: - upload_date = unified_strdate(upload_date) - return { - 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], - 'formats': [{ - 'format_id': m.group('format_id'), - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }], - 'upload_date': upload_date, - } + full_response = None + if head_response is False: + full_response = self._request_webpage(url, video_id) + head_response = full_response + + # Check for direct link to a video + content_type = head_response.headers.get('Content-Type', '') + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) + if m: + upload_date = unified_strdate( + head_response.headers.get('Last-Modified')) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': 'none' if m.group('type') == 'audio' else None + }], + 'upload_date': upload_date, + } if not self._downloader.params.get('test', False) and not is_intentional: self._downloader.report_warning('Falling back on generic information extractor.') - try: + if full_response: + webpage = _webpage_read_content(url, video_id) + else: webpage = self._download_webpage(url, video_id) - except ValueError: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError('Failed to download URL: %s' % url) - self.report_extraction(video_id) # Is it an RSS feed? @@ -623,7 +636,8 @@ class GenericIE(InfoExtractor): <iframe[^>]+?src=| data-video-url=| <embed[^>]+?src=| - embedSWF\(?:\s* + embedSWF\(?:\s*| + new\s+SWFObject\( ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ @@ -652,7 +666,7 @@ class GenericIE(InfoExtractor): # Look for embedded Wistia player match = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: embed_url = self._proto_relative_url( unescapeHTML(match.group('url'))) @@ -664,6 +678,7 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) if match: return { diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py new file mode 100644 index 000000000..9561ed5fb --- /dev/null +++ b/youtube_dl/extractor/glide.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class GlideIE(InfoExtractor): + IE_DESC = 'Glide mobile video messages (glide.me)' + _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)' + _TEST = { + 'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==', + 'md5': '4466372687352851af2d131cfaa8a4c7', + 'info_dict': { + 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', + 'ext': 'mp4', + 'title': 'Damon Timm\'s Glide message', + 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title') + video_url = self.http_scheme() + self._search_regex( + r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL') + thumbnail_url = self._search_regex( + r'<img id="video-thumbnail" src="(.*?)"', + webpage, 'thumbnail url', fatal=False) + thumbnail = ( + thumbnail_url if thumbnail_url is None + else self.http_scheme() + thumbnail_url) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index 5bdd08afa..b6cc15b6f 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,37 +1,33 @@ # -*- coding: utf-8 -*- - -import re -import json +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext + class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+' _TEST = { - u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - u'file': u'mmbzyhkgny.mp3', - u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', - u'info_dict': { - u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", - u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', - u'duration': 11, + 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + 'md5': '6783a58491b47b92c7c1af5a77d4cbee', + 'info_dict': { + 'id': 'mmbzyhkgny', + 'ext': 'mp3', + 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', + 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + 'duration': 11, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - json_url = "http://www.hark.com/clips/%s.json" %(video_id) - info_json = self._download_webpage(json_url, video_id) - info = json.loads(info_json) - final_url = info['url'] + video_id = self._match_id(url) + data = self._download_json( + 'http://www.hark.com/clips/%s.json' % video_id, video_id) - return {'id': video_id, - 'url' : final_url, - 'title': info['name'], - 'ext': determine_ext(final_url), - 'description': info['description'], - 'thumbnail': info['image_original'], - 'duration': info['duration'], - } + return { + 'id': video_id, + 'url': data['url'], + 'title': data['name'], + 'description': data.get('description'), + 'thumbnail': data.get('image_original'), + 'duration': data.get('duration'), + } diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index fca0bfef0..db5df4078 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -22,7 +22,7 @@ class LRTIE(InfoExtractor): 'id': '54391', 'ext': 'mp4', 'title': 'Septynios Kauno dienos', - 'description': 'Kauno miesto ir apskrities naujienos', + 'description': 'md5:24d84534c7dc76581e59f5689462411a', 'duration': 1783, }, 'params': { diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 979f3d692..6691521e5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -6,6 +6,7 @@ import json from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + compat_urlparse, get_element_by_attribute, parse_duration, strip_jsonp, @@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor): ).replace('\'', '"') embed_data = json.loads(embed_data_json) - info_url = embed_data['flashvars']['host'] + domain = embed_data['mediaUrl'] + if not domain.startswith('http'): + # only happens in telecinco.es videos + domain = 'http://' + domain + info_url = compat_urlparse.urljoin( + domain, + compat_urllib_parse.unquote(embed_data['flashvars']['host']) + ) info_el = self._download_xml(info_url, episode).find('./video/info') video_link = info_el.find('videoUrl/link').text token_query = compat_urllib_parse.urlencode({'id': video_link}) token_info = self._download_json( - 'http://token.mitele.es/?' + token_query, episode, + embed_data['flashvars']['ov_tk'] + '?' + token_query, + episode, transform_source=strip_jsonp ) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6229b2173..3621ff99e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,20 +5,20 @@ import re from .common import InfoExtractor from ..utils import ( - int_or_none, + str_to_int, unified_strdate, ) class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' + _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [ { 'url': 'http://motherless.com/AC3FFE1', - 'md5': '5527fef81d2e529215dad3c2d744a7d9', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', 'info_dict': { 'id': 'AC3FFE1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fucked in the ass while playing PS3', 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', @@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg', 'age_limit': 18, } + }, + { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } } ] - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + def _real_extract(self, url): + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - - video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') + title = self._html_search_regex( + r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + video_url = self._html_search_regex( + r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') age_limit = self._rta_search(webpage) - - view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') + view_count = str_to_int(self._html_search_regex( + r'<strong>Views</strong>\s+([^<]+)<', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._html_search_regex( + r'<strong>Favorited</strong>\s+([^<]+)<', + webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') + upload_date = self._html_search_regex( + r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) - like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') - comment_count = webpage.count('class="media-comment-contents"') - uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') + uploader_id = self._html_search_regex( + r'"thumb-member-username">\s+<a href="/m/([^"]+)"', + webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage) if categories: @@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor): 'uploader_id': uploader_id, 'thumbnail': self._og_search_thumbnail(webpage), 'categories': categories, - 'view_count': int_or_none(view_count.replace(',', '')), - 'like_count': int_or_none(like_count.replace(',', '')), + 'view_count': view_count, + 'like_count': like_count, 'comment_count': comment_count, 'age_limit': age_limit, 'url': video_url, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 072d9cf8e..d66c2c6f8 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -22,21 +22,23 @@ class NHLBaseInfoExtractor(InfoExtractor): self.report_extraction(video_id) initial_video_url = info['publishPoint'] - data = compat_urllib_parse.urlencode({ - 'type': 'fvod', - 'path': initial_video_url.replace('.mp4', '_sd.mp4'), - }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text + if info['formats'] == '1': + data = compat_urllib_parse.urlencode({ + 'type': 'fvod', + 'path': initial_video_url.replace('.mp4', '_sd.mp4'), + }) + path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data + path_doc = self._download_xml( + path_url, video_id, 'Downloading final video url') + video_url = path_doc.find('path').text + else: + video_url = initial_video_url join = compat_urlparse.urljoin return { 'id': video_id, 'title': info['name'], 'url': video_url, - 'ext': determine_ext(video_url), 'description': info['description'], 'duration': int(info['duration']), 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), @@ -46,10 +48,11 @@ class NHLBaseInfoExtractor(InfoExtractor): class NHLIE(NHLBaseInfoExtractor): IE_NAME = 'nhl.com' - _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9a-z-]+)' _TESTS = [{ 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', + 'md5': 'db704a4ea09e8d3988c85e36cc892d09', 'info_dict': { 'id': '453614', 'ext': 'mp4', @@ -59,6 +62,17 @@ class NHLIE(NHLBaseInfoExtractor): 'upload_date': '20131006', }, }, { + 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', + 'md5': 'd22e82bc592f52d37d24b03531ee9696', + 'info_dict': { + 'id': '2014020024-628-h', + 'ext': 'mp4', + 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', + 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', + 'duration': 0, + 'upload_date': '20141011', + }, + }, { 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', 'only_matching': True, }] diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8f140d626..6118ed5c2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -80,8 +80,14 @@ class PBSIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140122', } + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', + 'info_dict': { + 'id': 'united-states-of-secrets', + }, + 'playlist_count': 2, } - ] def _extract_webpage(self, url): @@ -96,6 +102,12 @@ class PBSIE(InfoExtractor): r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', webpage, 'upload date', default=None)) + # tabbed frontline videos + tabbed_videos = re.findall( + r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage) + if tabbed_videos: + return tabbed_videos, presumptive_id, upload_date + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer @@ -130,6 +142,12 @@ class PBSIE(InfoExtractor): def _real_extract(self, url): video_id, display_id, upload_date = self._extract_webpage(url) + if isinstance(video_id, list): + entries = [self.url_result( + 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) + for vid_id in video_id] + return self.playlist_result(entries, display_id) + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 4719ba45c..c77671fd3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -40,14 +40,15 @@ class SoundcloudIE(InfoExtractor): _TESTS = [ { 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - 'file': '62986583.mp3', 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', 'info_dict': { - "upload_date": "20121011", - "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", - "uploader": "E.T. ExTerrestrial Music", - "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1", - "duration": 143, + 'id': '62986583', + 'ext': 'mp3', + 'upload_date': '20121011', + 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', + 'uploader': 'E.T. ExTerrestrial Music', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'duration': 143, } }, # not streamable song @@ -103,7 +104,7 @@ class SoundcloudIE(InfoExtractor): 'id': '128590877', 'ext': 'mp3', 'title': 'Bus Brakes', - 'description': 'md5:0170be75dd395c96025d210d261c784e', + 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', 'upload_date': '20140109', 'duration': 17, diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 19cc976e3..b9cd35109 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -26,7 +26,6 @@ class SportBoxIE(InfoExtractor): 'timestamp': 1411896237, 'upload_date': '20140928', 'duration': 4846, - 'view_count': int, }, 'params': { # m3u8 download @@ -65,8 +64,6 @@ class SportBoxIE(InfoExtractor): r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) duration = parse_duration(self._html_search_regex( r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False)) return { 'id': video_id, @@ -76,6 +73,5 @@ class SportBoxIE(InfoExtractor): 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, - 'view_count': view_count, 'formats': formats, } diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py new file mode 100644 index 000000000..db9788c18 --- /dev/null +++ b/youtube_dl/extractor/telecinco.py @@ -0,0 +1,19 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .mitele import MiTeleIE + + +class TelecincoIE(MiTeleIE): + IE_NAME = 'telecinco.es' + _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html' + + _TEST = { + 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', + 'info_dict': { + 'id': 'MDSVID20141015_0058', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al ...', + 'duration': 662, + }, + } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 306fe8974..40c53ff17 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,9 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TumblrIE(InfoExtractor): @@ -18,7 +15,7 @@ class TumblrIE(InfoExtractor): 'id': '54196191430', 'ext': 'mp4', 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', - 'description': 'md5:dfac39636969fe6bf1caa2d50405f069', + 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', 'thumbnail': 're:http://.*\.jpg', } }, { @@ -27,7 +24,7 @@ class TumblrIE(InfoExtractor): 'info_dict': { 'id': '90208453769', 'ext': 'mp4', - 'title': '5SOS STRUM ;)', + 'title': '5SOS STRUM ;]', 'description': 'md5:dba62ac8639482759c8eb10ce474586a', 'thumbnail': 're:http://.*\.jpg', } @@ -41,18 +38,12 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage = self._download_webpage(url, video_id) - re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) - video = re.search(re_video, webpage) - if video is None: - raise ExtractorError('Unable to extract video') - video_url = video.group('video_url') - ext = video.group('ext') - - video_thumbnail = self._search_regex( - r'posters.*?\[\\x22(.*?)\\x22', - webpage, 'thumbnail', fatal=False) # We pick the first poster - if video_thumbnail: - video_thumbnail = video_thumbnail.replace('\\\\/', '/') + iframe_url = self._search_regex( + r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + webpage, 'iframe url') + iframe = self._download_webpage(iframe_url, video_id) + video_url = self._search_regex(r'<source src="([^"]+)"', + iframe, 'video url') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos @@ -62,9 +53,9 @@ class TumblrIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'title': video_title, - 'description': self._html_search_meta('description', webpage), - 'thumbnail': video_thumbnail, - 'ext': ext, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 9328ef4a2..0faa729c6 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -1,55 +1,85 @@ -import json -import re +from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { - u"url": u"http://www.viddler.com/v/43903784", - u'file': u'43903784.mp4', - u'md5': u'fbbaedf7813e514eb7ca30410f439ac9', - u'info_dict': { - u"title": u"Video Made Easy", - u"uploader": u"viddler", - u"duration": 100.89, + "url": "http://www.viddler.com/v/43903784", + 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4', + 'info_dict': { + 'id': '43903784', + 'ext': 'mp4', + "title": "Video Made Easy", + 'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ', + "uploader": "viddler", + 'timestamp': 1335371429, + 'upload_date': '20120425', + "duration": 100.89, + 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - embed_url = mobj.group('domain') + u'/embed/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - video_sources_code = self._search_regex( - r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs') - video_sources = json.loads(video_sources_code.replace("'", '"')) - - formats = [{ - 'url': video_url, - 'format': format_id, - } for video_url, format_id in video_sources.items()] - - title = self._html_search_regex( - r"title\s*:\s*'([^']*)'", webpage, u'title') - uploader = self._html_search_regex( - r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False) - duration_s = self._html_search_regex( - r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False) - duration = float(duration_s) if duration_s else None - thumbnail = self._html_search_regex( - r"thumbnail\s*:\s*'([^']*)'", - webpage, u'thumbnail', fatal=False) + video_id = self._match_id(url) + + json_url = ( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' % + video_id) + data = self._download_json(json_url, video_id)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + f = { + 'format_id': filed['profile_id'], + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url']) + f['format_id'] = filed['profile_id'] + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url( + filed['html5_video_source']) + f['format_id'] = filed['profile_id'] + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] return { '_type': 'video', 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, + 'title': data['title'], 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'categories': categories, } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py new file mode 100644 index 000000000..669979e13 --- /dev/null +++ b/youtube_dl/extractor/vidzi.py @@ -0,0 +1,33 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class VidziIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' + _TEST = { + 'url': 'http://vidzi.tv/cghql9yq6emu.html', + 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', + 'info_dict': { + 'id': 'cghql9yq6emu', + 'ext': 'mp4', + 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + video_url = self._html_search_regex( + r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + title = self._html_search_regex( + r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } +
\ No newline at end of file diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py new file mode 100644 index 000000000..57ef8dc30 --- /dev/null +++ b/youtube_dl/extractor/vrt.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import float_or_none + + +class VRTIE(InfoExtractor): + _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' + _TESTS = [ + # deredactie.be + { + 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL', + 'md5': '4cebde1eb60a53782d4f3992cbd46ec8', + 'info_dict': { + 'id': '2129880', + 'ext': 'flv', + 'title': 'Het journaal L - 25/10/14', + 'description': None, + 'timestamp': 1414271750.949, + 'upload_date': '20141025', + 'duration': 929, + } + }, + # sporza.be + { + 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time', + 'md5': '11f53088da9bf8e7cfc42456697953ff', + 'info_dict': { + 'id': '2124639', + 'ext': 'flv', + 'title': 'Bekijk Extra Time van 20 oktober', + 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426', + 'timestamp': 1413835980.560, + 'upload_date': '20141020', + 'duration': 3238, + } + }, + # cobra.be + { + 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari', + 'md5': '78a2b060a5083c4f055449a72477409d', + 'info_dict': { + 'id': '2126050', + 'ext': 'flv', + 'title': 'Bret Easton Ellis in Café Corsari', + 'description': 'md5:f699986e823f32fd6036c1855a724ee9', + 'timestamp': 1413967500.494, + 'upload_date': '20141022', + 'duration': 661, + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) + + formats = [] + mobj = re.search( + r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"', + webpage) + if mobj: + formats.extend(self._extract_m3u8_formats( + '%s/%s' % (mobj.group('server'), mobj.group('path')), + video_id, 'mp4')) + mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage) + if mobj: + formats.extend(self._extract_f4m_formats( + '%s/manifest.f4m' % mobj.group('src'), video_id)) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = float_or_none(self._search_regex( + r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) + duration = float_or_none(self._search_regex( + r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cfae2de89..4ab56e0ac 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -191,8 +191,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _real_initialize(self): if self._downloader is None: return - if not self._set_language(): - return + if self._get_login_info()[0] is not None: + if not self._set_language(): + return if not self._login(): return self._confirm_age() |