diff options
Diffstat (limited to 'youtube_dl/extractor')
61 files changed, 1314 insertions, 575 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index dd770fdf1..17ab49283 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,12 +20,14 @@ from .arte import ( ArteTVDDCIE, ArteTVEmbedIE, ) +from .audiomack import AudiomackIE from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE @@ -60,7 +62,10 @@ from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .cracked import CrackedIE from .criterion import CriterionIE -from .crunchyroll import CrunchyrollIE +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE +) from .cspan import CSpanIE from .d8 import D8IE from .dailymotion import ( @@ -134,6 +139,7 @@ from .gamestar import GameStarIE from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE from .golem import GolemIE @@ -173,7 +179,6 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE -from .justintv import JustinTVIE from .jpopsukitv import JpopsukiIE from .kankan import KankanIE from .keezmovies import KeezMoviesIE @@ -316,6 +321,7 @@ from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE from .servingsys import ServingSysIE +from .sexykarma import SexyKarmaIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE @@ -349,6 +355,7 @@ from .spike import SpikeIE from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE @@ -367,10 +374,12 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .telecinco import TelecincoIE from .telemb import TeleMBIE from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE +from .theonion import TheOnionIE from .theplatform import ThePlatformIE from .thesixtyone import TheSixtyOneIE from .thisav import ThisAVIE @@ -394,6 +403,7 @@ from .tutv import TutvIE from .tvigle import TvigleIE from .tvp import TvpIE from .tvplay import TVPlayIE +from .twitch import TwitchIE from .ubu import UbuIE from .udemy import ( UdemyIE, @@ -419,6 +429,7 @@ from .videopremium import VideoPremiumIE from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vidme import VidmeIE +from .vidzi import VidziIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -438,6 +449,7 @@ from .viki import VikiIE from .vk import VKIE from .vodlocker import VodlockerIE from .vporn import VpornIE +from .vrt import VRTIE from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE @@ -487,10 +499,8 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) - from .zdf import ZDFIE - _ALL_CLASSES = [ klass for name, klass in globals().items() diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index c3d02f85e..b9a9440c0 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,8 +10,8 @@ from ..utils import ( unified_strdate, determine_ext, get_element_by_id, - compat_str, get_element_by_attribute, + int_or_none, ) # There are different sources of video in arte.tv, the extraction process @@ -90,15 +90,24 @@ class ArteTVPlus7IE(InfoExtractor): if not upload_date_str: upload_date_str = player_info.get('VDA', '').split(' ')[0] + title = player_info['VTI'].strip() + subtitle = player_info.get('VSU', '').strip() + if subtitle: + title += ' - %s' % subtitle + info_dict = { 'id': player_info['VID'], - 'title': player_info['VTI'], + 'title': title, 'description': player_info.get('VDE'), 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } - all_formats = player_info['VSR'].values() + all_formats = [] + for format_id, format_dict in player_info['VSR'].items(): + fmt = dict(format_dict) + fmt['format_id'] = format_id + all_formats.append(fmt) # Some formats use the m3u8 protocol all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) def _match_lang(f): @@ -149,22 +158,12 @@ class ArteTVPlus7IE(InfoExtractor): ) formats = sorted(formats, key=sort_key) def _format(format_info): - quality = '' - height = format_info.get('height') - if height is not None: - quality = compat_str(height) - bitrate = format_info.get('bitrate') - if bitrate is not None: - quality += '-%d' % bitrate - if format_info.get('versionCode') is not None: - format_id = '%s-%s' % (quality, format_info['versionCode']) - else: - format_id = quality info = { - 'format_id': format_id, - 'format_note': format_info.get('versionLibelle'), - 'width': format_info.get('width'), - 'height': height, + 'format_id': format_info['format_id'], + 'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')), + 'width': int_or_none(format_info.get('width')), + 'height': int_or_none(format_info.get('height')), + 'tbr': int_or_none(format_info.get('bitrate')), } if format_info['mediaType'] == 'rtmp': info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py new file mode 100644 index 000000000..6232d2cd0 --- /dev/null +++ b/youtube_dl/extractor/audiomack.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .soundcloud import SoundcloudIE +from ..utils import ExtractorError + +import time + + +class AudiomackIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + IE_NAME = 'audiomack' + _TESTS = [ + #hosted on audiomack + { + 'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary', + 'info_dict': + { + 'id' : 'roosh-williams/extraordinary', + 'ext': 'mp3', + 'title': 'Roosh Williams - Extraordinary' + } + }, + #hosted on soundcloud via audiomack + { + 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', + 'file': '172419696.mp3', + 'info_dict': + { + 'ext': 'mp3', + 'title': 'Young Thug ft Lil Wayne - Take Kare', + "upload_date": "20141016", + "description": "New track produced by London On Da Track called “Take Kare\"\n\nhttp://instagram.com/theyoungthugworld\nhttps://www.facebook.com/ThuggerThuggerCashMoney\n", + "uploader": "Young Thug World" + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_response = self._download_json( + "http://www.audiomack.com/api/music/url/song/%s?_=%d" % ( + video_id, time.time()), + video_id) + + if "url" not in api_response: + raise ExtractorError("Unable to deduce api url of song") + realurl = api_response["url"] + + #Audiomack wraps a lot of soundcloud tracks in their branded wrapper + # - if so, pass the work off to the soundcloud extractor + if SoundcloudIE.suitable(realurl): + return {'_type': 'url', 'url': realurl, 'ie_key': 'Soundcloud'} + + webpage = self._download_webpage(url, video_id) + artist = self._html_search_regex( + r'<span class="artist">(.*?)</span>', webpage, "artist") + songtitle = self._html_search_regex( + r'<h1 class="profile-title song-title"><span class="artist">.*?</span>(.*?)</h1>', + webpage, "title") + title = artist + " - " + songtitle + + return { + 'id': video_id, + 'title': title, + 'url': realurl, + } diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py new file mode 100644 index 000000000..0269d1174 --- /dev/null +++ b/youtube_dl/extractor/bild.py @@ -0,0 +1,39 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class BildIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' + IE_DESC = 'Bild.de' + _TEST = { + 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', + 'md5': 'dd495cbd99f2413502a1713a1156ac8a', + 'info_dict': { + 'id': '38184146', + 'ext': 'mp4', + 'title': 'BILD hat sie getestet', + 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', + 'duration': 196, + 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" + doc = self._download_xml(xml_url, video_id) + + duration = int_or_none(doc.attrib.get('duration'), scale=1000) + + return { + 'id': video_id, + 'title': doc.attrib['ueberschrift'], + 'description': doc.attrib.get('text'), + 'url': doc.attrib['src'], + 'thumbnail': doc.attrib.get('img'), + 'duration': duration, + } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 2e277c8c3..45ba51732 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 294670386..ad22cbafd 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -87,6 +87,15 @@ class BrightcoveIE(InfoExtractor): 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', }, }, + { + # playlist test + # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', + 'info_dict': { + 'title': 'Sealife', + }, + 'playlist_mincount': 7, + }, ] @classmethod diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 496271be4..d064a28f9 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: raise ExtractorError('Can\'t extract embed url and video id') playerdata_url = mobj.group('embed_url') @@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor): video_description = self._html_search_regex( r'<div class="entry-content">(?P<description>.+?)</div>', webpage, 'description', flags=re.DOTALL, fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') - video_thumbnail = self._search_regex( - r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) - sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') - videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') + vidurl = self._search_regex( + r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') + vidid = self._search_regex( + r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') + videoserver = self._html_search_regex( + r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') + + videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') formats = [] - baseurl = sd_url[:sd_url.rfind('/')+1] + baseurl = vidurl[:vidurl.rfind('/')+1] for video in videolist.findall('.//video'): src = video.get('src') if not src: diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index d4227e6eb..2edab90a3 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -4,7 +4,6 @@ import json import re from .common import InfoExtractor -from ..utils import int_or_none _translation_table = { @@ -39,9 +38,7 @@ class CliphunterIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._search_regex( diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index dae40c136..78877b1cf 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(-ap)?|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 450c7dfd6..e1bd6bb49 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -89,6 +89,10 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. * http_referer HTTP Referer header value to set. * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers @@ -238,7 +242,6 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@ -247,6 +250,10 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -281,6 +288,12 @@ class InfoExtractor(object): raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen('Saving request to ' + filename) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if os.name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = '\\\\?\\' + absfilepath with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -299,7 +312,7 @@ class InfoExtractor(object): msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) - return (content, urlh) + return content def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the data of the page as a string """ @@ -607,12 +620,13 @@ class InfoExtractor(object): audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, + f.get('source_preference') if f.get('source_preference') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) def http_scheme(self): - """ Either "https:" or "https:", depending on the user's preferences """ + """ Either "http:" or "https:", depending on the user's preferences """ return ( 'http:' if self._downloader.params.get('prefer_insecure', False) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index ffbe4903b..7a7e79360 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -34,6 +34,8 @@ class CondeNastIE(InfoExtractor): _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) + EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys()) + _TEST = { 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', 'md5': '1921f713ed48aabd715691f774c451f7', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f99888ecc..05b21e872 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -24,6 +24,7 @@ from ..aes import ( aes_cbc_decrypt, inc, ) +from .common import InfoExtractor class CrunchyrollIE(SubtitlesInfoExtractor): @@ -39,6 +40,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor): 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', + 'url': 're:(?!.*&)', }, 'params': { # rtmp @@ -237,12 +239,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) - streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) - video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') - video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path') + streamdata = self._download_xml( + streamdata_req, video_id, + note='Downloading media info for %s' % video_format) + video_url = streamdata.find('.//host').text + video_play_path = streamdata.find('.//file').text formats.append({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_play_path, 'ext': 'flv', 'format': video_format, 'format_id': video_format, @@ -285,3 +289,40 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'subtitles': subtitles, 'formats': formats, } + + +class CrunchyrollShowPlaylistIE(InfoExtractor): + IE_NAME = "crunchyroll:playlist" + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' + + _TESTS = [{ + 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', + 'info_dict': { + 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', + 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + + webpage = self._download_webpage(url, show_id) + title = self._html_search_regex( + r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', + webpage, 'title') + episode_paths = re.findall( + r'(?s)<li id="showview_videos_media_[0-9]+"[^>]+>.*?<a href="([^"]+)"', + webpage) + entries = [ + self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll') + for ep in episode_paths + ] + entries.reverse() + + return { + '_type': 'playlist', + 'id': show_id, + 'title': title, + 'entries': entries, + } diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index c6ab6952e..3c39ca451 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,49 +1,48 @@ # encoding: utf-8 -import re +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - determine_ext, -) class FazIE(InfoExtractor): - IE_NAME = u'faz.net' + IE_NAME = 'faz.net' _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' _TEST = { - u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', - u'file': u'12610585.mp4', - u'info_dict': { - u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', - u'description': u'md5:1453fbf9a0d041d985a47306192ea253', + 'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', + 'info_dict': { + 'id': '12610585', + 'ext': 'mp4', + 'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', + 'description': 'md5:1453fbf9a0d041d985a47306192ea253', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - self.to_screen(video_id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, - u'config xml url') - config = self._download_xml(config_xml_url, video_id, - u'Downloading config xml') + config_xml_url = self._search_regex( + r'writeFLV\(\'(.+?)\',', webpage, 'config xml url') + config = self._download_xml( + config_xml_url, video_id, 'Downloading config xml') encodings = config.find('ENCODINGS') formats = [] - for code in ['LOW', 'HIGH', 'HQ']: + for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): encoding = encodings.find(code) if encoding is None: continue encoding_url = encoding.find('FILENAME').text formats.append({ 'url': encoding_url, - 'ext': determine_ext(encoding_url), 'format_id': code.lower(), + 'quality': pref, }) + self._sort_formats(formats) - descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') + descr = self._html_search_regex( + r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False) return { 'id': video_id, 'title': self._og_search_title(webpage), diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 0b3374d97..35d7d15e1 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -46,7 +46,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): f4m_format['preference'] = 1 formats.extend(f4m_formats) elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): formats.append({ 'url': video_url, 'format_id': format_id, - 'preference': 2, + 'preference': -1, }) self._sort_formats(formats) @@ -93,7 +93,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', - 'md5': '9cecf35f99c4079c199e9817882a9a1c', 'info_dict': { 'id': '84981923', 'ext': 'flv', diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index d966e8403..ec6d96ada 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -37,7 +37,7 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage) + links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage) if not links: raise ExtractorError('No media links available for %s' % video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dfc2ef4e7..35a7664b2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -28,6 +28,7 @@ from .brightcove import BrightcoveIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE +from .condenast import CondeNastIE class GenericIE(InfoExtractor): @@ -324,7 +325,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'age_limit': 18, 'uploader': 'www.handjobhub.com', - 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, # RSS feed @@ -379,6 +380,32 @@ class GenericIE(InfoExtractor): 'uploader': 'education-portal.com', }, }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented' + ], + } ] def report_following_redirect(self, new_url): @@ -475,7 +502,8 @@ class GenericIE(InfoExtractor): 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) @@ -490,14 +518,14 @@ class GenericIE(InfoExtractor): self.to_screen('%s: Requesting header' % video_id) head_req = HEADRequest(url) - response = self._request_webpage( + head_response = self._request_webpage( head_req, video_id, note=False, errnote='Could not send HEAD request to %s' % url, fatal=False) - if response is not False: + if head_response is not False: # Check for redirect - new_url = response.geturl() + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -505,34 +533,35 @@ class GenericIE(InfoExtractor): new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) - # Check for direct link to a video - content_type = response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) - if m: - upload_date = response.headers.get('Last-Modified') - if upload_date: - upload_date = unified_strdate(upload_date) - return { - 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], - 'formats': [{ - 'format_id': m.group('format_id'), - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }], - 'upload_date': upload_date, - } + full_response = None + if head_response is False: + full_response = self._request_webpage(url, video_id) + head_response = full_response + + # Check for direct link to a video + content_type = head_response.headers.get('Content-Type', '') + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) + if m: + upload_date = unified_strdate( + head_response.headers.get('Last-Modified')) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': 'none' if m.group('type') == 'audio' else None + }], + 'upload_date': upload_date, + } if not self._downloader.params.get('test', False) and not is_intentional: self._downloader.report_warning('Falling back on generic information extractor.') - try: + if full_response: + webpage = self._webpage_read_content(full_response, url, video_id) + else: webpage = self._download_webpage(url, video_id) - except ValueError: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError('Failed to download URL: %s' % url) - self.report_extraction(video_id) # Is it an RSS feed? @@ -608,13 +637,13 @@ class GenericIE(InfoExtractor): if mobj: player_url = unescapeHTML(mobj.group('url')) surl = smuggle_url(player_url, {'Referer': url}) - return self.url_result(surl, 'Vimeo') + return self.url_result(surl) # Look for embedded (swf embed) Vimeo player mobj = re.search( - r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) + r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: - return self.url_result(mobj.group(1), 'Vimeo') + return self.url_result(mobj.group(1)) # Look for embedded YouTube player matches = re.findall(r'''(?x) @@ -622,7 +651,8 @@ class GenericIE(InfoExtractor): <iframe[^>]+?src=| data-video-url=| <embed[^>]+?src=| - embedSWF\(?:\s* + embedSWF\(?:\s*| + new\s+SWFObject\( ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ @@ -651,17 +681,20 @@ class GenericIE(InfoExtractor): # Look for embedded Wistia player match = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: + embed_url = self._proto_relative_url( + unescapeHTML(match.group('url'))) return { '_type': 'url_transparent', - 'url': unescapeHTML(match.group('url')), + 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, 'title': video_title, 'id': video_id, } - match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) if match: return { '_type': 'url_transparent', @@ -847,6 +880,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, + webpage) + if mobj is not None: + return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + def check_video(vurl): vpath = compat_urlparse.urlparse(vurl).path vext = determine_ext(vpath) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py new file mode 100644 index 000000000..9561ed5fb --- /dev/null +++ b/youtube_dl/extractor/glide.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class GlideIE(InfoExtractor): + IE_DESC = 'Glide mobile video messages (glide.me)' + _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)' + _TEST = { + 'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==', + 'md5': '4466372687352851af2d131cfaa8a4c7', + 'info_dict': { + 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', + 'ext': 'mp4', + 'title': 'Damon Timm\'s Glide message', + 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title') + video_url = self.http_scheme() + self._search_regex( + r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL') + thumbnail_url = self._search_regex( + r'<img id="video-thumbnail" src="(.*?)"', + webpage, 'thumbnail url', fatal=False) + thumbnail = ( + thumbnail_url if thumbnail_url is None + else self.http_scheme() + thumbnail_url) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 07d994b44..fcefe54cd 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -1,13 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re +import codecs from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from ..utils import unified_strdate class GooglePlusIE(InfoExtractor): @@ -19,74 +17,57 @@ class GooglePlusIE(InfoExtractor): 'info_dict': { 'id': 'ZButuJc6CtH', 'ext': 'flv', + 'title': '嘆きの天使 降臨', 'upload_date': '20120613', 'uploader': '井上ヨシマサ', - 'title': '嘆きの天使 降臨', } } def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') + video_id = self._match_id(url) # Step 1, Retrieve post webpage to extract further information webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - self.report_extraction(video_id) - - # Extract update date - upload_date = self._html_search_regex( + title = self._og_search_description(webpage).splitlines()[0] + upload_date = unified_strdate(self._html_search_regex( r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*> ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE) - if upload_date: - # Convert timestring to a format suitable for filename - upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") - upload_date = upload_date.strftime('%Y%m%d') - - # Extract uploader - uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', - webpage, 'uploader', fatal=False) - - # Extract title - # Get the first line for title - video_title = self._og_search_description(webpage).splitlines()[0] + webpage, 'upload date', fatal=False, flags=re.VERBOSE)) + uploader = self._html_search_regex( + r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False) # Step 2, Simulate clicking the image box to launch video DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), + video_page = self._search_regex( + r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, 'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page webpage = self._download_webpage(video_page, video_id, 'Downloading video page') - # Extract video links all sizes - pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' - mobj = re.findall(pattern, webpage) - if len(mobj) == 0: - raise ExtractorError('Unable to extract video links') - - # Sort in resolution - links = sorted(mobj) + def unicode_escape(s): + decoder = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4,}', + lambda m: decoder(m.group(0))[0], + s) - # Choose the lowest of the sort, i.e. highest resolution - video_url = links[-1] - # Only get the url. The resolution part in the tuple has no use anymore - video_url = video_url[-1] - # Treat escaped \u0026 style hex - try: - video_url = video_url.decode("unicode_escape") - except AttributeError: # Python 3 - video_url = bytes(video_url, 'ascii').decode('unicode-escape') + # Extract video links all sizes + formats = [{ + 'url': unicode_escape(video_url), + 'ext': 'flv', + 'width': int(width), + 'height': int(height), + } for width, height, video_url in re.findall( + r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, + 'title': title, 'uploader': uploader, 'upload_date': upload_date, - 'title': video_title, - 'ext': 'flv', + 'formats': formats, } diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index 45cca1d24..e21e57510 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -46,9 +46,9 @@ class GorillaVidIE(InfoExtractor): 'info_dict': { 'id': '3rso4kdn6f9m', 'ext': 'mp4', - 'title': 'Micro Pig piglets ready on 16th July 2009', + 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', 'thumbnail': 're:http://.*\.jpg', - }, + } }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index 5bdd08afa..b6cc15b6f 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,37 +1,33 @@ # -*- coding: utf-8 -*- - -import re -import json +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext + class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+' _TEST = { - u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - u'file': u'mmbzyhkgny.mp3', - u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', - u'info_dict': { - u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", - u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', - u'duration': 11, + 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + 'md5': '6783a58491b47b92c7c1af5a77d4cbee', + 'info_dict': { + 'id': 'mmbzyhkgny', + 'ext': 'mp3', + 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', + 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + 'duration': 11, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - json_url = "http://www.hark.com/clips/%s.json" %(video_id) - info_json = self._download_webpage(json_url, video_id) - info = json.loads(info_json) - final_url = info['url'] + video_id = self._match_id(url) + data = self._download_json( + 'http://www.hark.com/clips/%s.json' % video_id, video_id) - return {'id': video_id, - 'url' : final_url, - 'title': info['name'], - 'ext': determine_ext(final_url), - 'description': info['description'], - 'thumbnail': info['image_original'], - 'duration': info['duration'], - } + return { + 'id': video_id, + 'url': data['url'], + 'title': data['name'], + 'description': data.get('description'), + 'thumbnail': data.get('image_original'), + 'duration': data.get('duration'), + } diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index f97b1e085..d41c0413f 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( get_meta_content, + int_or_none, parse_iso8601, ) @@ -28,20 +29,26 @@ class HeiseIE(InfoExtractor): 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', + 'thumbnail': 're:https?://.*\.jpg$', } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - json_url = self._search_regex( - r'json_url:\s*"([^"]+)"', webpage, 'json URL') - config = self._download_json(json_url, video_id) + + container_id = self._search_regex( + r'<div class="videoplayerjw".*?data-container="([0-9]+)"', + webpage, 'container ID') + sequenz_id = self._search_regex( + r'<div class="videoplayerjw".*?data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + data_url = 'http://www.heise.de/videout/feed?container=%s&sequenz=%s' % (container_id, sequenz_id) + doc = self._download_xml(data_url, video_id) info = { 'id': video_id, - 'thumbnail': config.get('poster'), + 'thumbnail': self._og_search_thumbnail(webpage), 'timestamp': parse_iso8601(get_meta_content('date', webpage)), 'description': self._og_search_description(webpage), } @@ -49,32 +56,19 @@ class HeiseIE(InfoExtractor): title = get_meta_content('fulltitle', webpage) if title: info['title'] = title - elif config.get('title'): - info['title'] = config['title'] else: info['title'] = self._og_search_title(webpage) formats = [] - for t, rs in config['formats'].items(): - if not rs or not hasattr(rs, 'items'): - self._downloader.report_warning( - 'formats: {0}: no resolutions'.format(t)) - continue - - for height_str, obj in rs.items(): - format_id = '{0}_{1}'.format(t, height_str) - - if not obj or not obj.get('url'): - self._downloader.report_warning( - 'formats: {0}: no url'.format(format_id)) - continue - - formats.append({ - 'url': obj['url'], - 'format_id': format_id, - 'height': self._int(height_str, 'height'), - }) - + for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): + label = source_node.attrib['label'] + height = int_or_none(self._search_regex( + r'^(.*?_)?([0-9]+)p$', label, 'height', default=None)) + formats.append({ + 'url': source_node.attrib['file'], + 'format_note': label, + 'height': height, + }) self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 68684b997..fccc23884 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -28,13 +28,13 @@ class HowStuffWorksIE(InfoExtractor): } }, { - 'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm', + 'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm', 'info_dict': { - 'id': '553470', - 'display_id': 'deadliest-catch-jakes-farewell-pots', + 'id': '453464', + 'display_id': 'survival-zone-food-and-water-in-the-savanna', 'ext': 'mp4', - 'title': 'Deadliest Catch: Jake\'s Farewell Pots', - 'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc', + 'title': 'Survival Zone: Food and Water In the Savanna', + 'description': 'md5:7e1c89f6411434970c15fa094170c371', 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 94e7cf790..4ccf6b9b8 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -33,8 +33,7 @@ class HuffPostIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id data = self._download_json(api_url, video_id)['data'] diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py deleted file mode 100644 index 27017e89f..000000000 --- a/youtube_dl/extractor/justintv.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import json -import os -import re - -from .common import InfoExtractor -from ..utils import ( - compat_str, - ExtractorError, - formatSeconds, -) - - -class JustinTVIE(InfoExtractor): - """Information extractor for justin.tv and twitch.tv""" - # TODO: One broadcast may be split into multiple videos. The key - # 'broadcast_id' is the same for all parts, and 'broadcast_part' - # starts at 1 and increases. Can we treat all parts as one video? - - _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/ - (?: - (?P<channelid>[^/]+)| - (?:(?:[^/]+)/b/(?P<videoid>[^/]+))| - (?:(?:[^/]+)/c/(?P<chapterid>[^/]+)) - ) - /?(?:\#.*)?$ - """ - _JUSTIN_PAGE_LIMIT = 100 - IE_NAME = 'justin.tv' - IE_DESC = 'justin.tv and twitch.tv' - _TEST = { - 'url': 'http://www.twitch.tv/thegamedevhub/b/296128360', - 'md5': 'ecaa8a790c22a40770901460af191c9a', - 'info_dict': { - 'id': '296128360', - 'ext': 'flv', - 'upload_date': '20110927', - 'uploader_id': 25114803, - 'uploader': 'thegamedevhub', - 'title': 'Beginner Series - Scripting With Python Pt.1' - } - } - - # Return count of items, list of *valid* items - def _parse_page(self, url, video_id, counter): - info_json = self._download_webpage( - url, video_id, - 'Downloading video info JSON on page %d' % counter, - 'Unable to download video info JSON %d' % counter) - - response = json.loads(info_json) - if type(response) != list: - error_text = response.get('error', 'unknown error') - raise ExtractorError('Justin.tv API: %s' % error_text) - info = [] - for clip in response: - video_url = clip['video_file_url'] - if video_url: - video_extension = os.path.splitext(video_url)[1][1:] - video_date = re.sub('-', '', clip['start_time'][:10]) - video_uploader_id = clip.get('user_id', clip.get('channel_id')) - video_id = clip['id'] - video_title = clip.get('title', video_id) - info.append({ - 'id': compat_str(video_id), - 'url': video_url, - 'title': video_title, - 'uploader': clip.get('channel_name', video_uploader_id), - 'uploader_id': video_uploader_id, - 'upload_date': video_date, - 'ext': video_extension, - }) - return (len(response), info) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - api_base = 'http://api.justin.tv' - paged = False - if mobj.group('channelid'): - paged = True - video_id = mobj.group('channelid') - api = api_base + '/channel/archives/%s.json' % video_id - elif mobj.group('chapterid'): - chapter_id = mobj.group('chapterid') - - webpage = self._download_webpage(url, chapter_id) - m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) - if not m: - raise ExtractorError('Cannot find archive of a chapter') - archive_id = m.group(1) - - api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id - doc = self._download_xml( - api, chapter_id, - note='Downloading chapter information', - errnote='Chapter information download failed') - for a in doc.findall('.//archive'): - if archive_id == a.find('./id').text: - break - else: - raise ExtractorError('Could not find chapter in chapter information') - - video_url = a.find('./video_file_url').text - video_ext = video_url.rpartition('.')[2] or 'flv' - - chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id - chapter_info = self._download_json( - chapter_api_url, 'c' + chapter_id, - note='Downloading chapter metadata', - errnote='Download of chapter metadata failed') - - bracket_start = int(doc.find('.//bracket_start').text) - bracket_end = int(doc.find('.//bracket_end').text) - - # TODO determine start (and probably fix up file) - # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 - #video_url += '?start=' + TODO:start_timestamp - # bracket_start is 13290, but we want 51670615 - self._downloader.report_warning('Chapter detected, but we can just download the whole file. ' - 'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) - - info = { - 'id': 'c' + chapter_id, - 'url': video_url, - 'ext': video_ext, - 'title': chapter_info['title'], - 'thumbnail': chapter_info['preview'], - 'description': chapter_info['description'], - 'uploader': chapter_info['channel']['display_name'], - 'uploader_id': chapter_info['channel']['name'], - } - return info - else: - video_id = mobj.group('videoid') - api = api_base + '/broadcast/by_archive/%s.json' % video_id - - entries = [] - offset = 0 - limit = self._JUSTIN_PAGE_LIMIT - for counter in itertools.count(1): - page_url = api + ('?offset=%d&limit=%d' % (offset, limit)) - page_count, page_info = self._parse_page( - page_url, video_id, counter) - entries.extend(page_info) - if not paged or page_count != limit: - break - offset += limit - return { - '_type': 'playlist', - 'id': video_id, - 'entries': entries, - } diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 56a76380c..827091e60 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -21,22 +19,17 @@ class KickStarterIE(InfoExtractor): }, { 'note': 'Embedded video (not using the native kickstarter video service)', 'url': 'https://www.kickstarter.com/projects/597507018/pebble-e-paper-watch-for-iphone-and-android/posts/659178', - 'playlist': [ - { - 'info_dict': { - 'id': '78704821', - 'ext': 'mp4', - 'uploader_id': 'pebble', - 'uploader': 'Pebble Technology', - 'title': 'Pebble iOS Notifications', - } - } - ], + 'info_dict': { + 'id': '78704821', + 'ext': 'mp4', + 'uploader_id': 'pebble', + 'uploader': 'Pebble Technology', + 'title': 'Pebble iOS Notifications', + } }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 5341ac773..8a73ecfa0 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -34,7 +34,7 @@ class KontrTubeIE(InfoExtractor): video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL') thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False) title = self._html_search_regex( - r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title') + r'<title>(.+?)</title>', webpage, 'video title') description = self._html_search_meta('description', webpage, 'video description') mobj = re.search( diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 516147417..363a12ad0 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -190,7 +190,8 @@ class LivestreamOriginalIE(InfoExtractor): 'id': video_id, 'title': item.find('title').text, 'url': 'rtmp://extondemand.livestream.com/ondemand', - 'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path), + 'play_path': 'trans/dv15/mogulus-{0}'.format(path), + 'player_url': 'http://static.livestream.com/chromelessPlayer/v21/playerapi.swf?hash=5uetk&v=0803&classid=D27CDB6E-AE6D-11cf-96B8-444553540000&jsEnabled=false&wmode=opaque', 'ext': 'flv', 'thumbnail': thumbnail_url, } diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index fca0bfef0..db5df4078 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -22,7 +22,7 @@ class LRTIE(InfoExtractor): 'id': '54391', 'ext': 'mp4', 'title': 'Septynios Kauno dienos', - 'description': 'Kauno miesto ir apskrities naujienos', + 'description': 'md5:24d84534c7dc76581e59f5689462411a', 'duration': 1783, }, 'params': { diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 979f3d692..6691521e5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -6,6 +6,7 @@ import json from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + compat_urlparse, get_element_by_attribute, parse_duration, strip_jsonp, @@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor): ).replace('\'', '"') embed_data = json.loads(embed_data_json) - info_url = embed_data['flashvars']['host'] + domain = embed_data['mediaUrl'] + if not domain.startswith('http'): + # only happens in telecinco.es videos + domain = 'http://' + domain + info_url = compat_urlparse.urljoin( + domain, + compat_urllib_parse.unquote(embed_data['flashvars']['host']) + ) info_el = self._download_xml(info_url, episode).find('./video/info') video_link = info_el.find('videoUrl/link').text token_query = compat_urllib_parse.urlencode({'id': video_link}) token_info = self._download_json( - 'http://token.mitele.es/?' + token_query, episode, + embed_data['flashvars']['ov_tk'] + '?' + token_query, + episode, transform_source=strip_jsonp ) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 520f27fca..bb8937c4d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -33,22 +33,22 @@ class MixcloudIE(InfoExtractor): }, } - def check_urls(self, url_list): - """Returns 1st active url from list""" - for url in url_list: + def _get_url(self, track_id, template_url): + server_count = 30 + for i in range(server_count): + url = template_url % i try: # We only want to know if the request succeed # don't download the whole file - self._request_webpage(HEADRequest(url), None, False) + self._request_webpage( + HEADRequest(url), track_id, + 'Checking URL %d/%d ...' % (i + 1, server_count + 1)) return url except ExtractorError: - url = None + pass return None - def _get_url(self, template_url): - return self.check_urls(template_url % i for i in range(30)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) @@ -61,16 +61,16 @@ class MixcloudIE(InfoExtractor): r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url') song_url = preview_url.replace('/previews/', '/c/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self._get_url(template_url) + final_song_url = self._get_url(track_id, template_url) if final_song_url is None: self.to_screen('Trying with m4a extension') template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - final_song_url = self._get_url(template_url) + final_song_url = self._get_url(track_id, template_url) if final_song_url is None: raise ExtractorError('Unable to extract track url') PREFIX = ( - r'<div class="cloudcast-play-button-container"' + r'<div class="cloudcast-play-button-container[^"]*?"' r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') title = self._html_search_regex( PREFIX + r'm-title="([^"]+)"', webpage, 'title') diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6229b2173..3621ff99e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,20 +5,20 @@ import re from .common import InfoExtractor from ..utils import ( - int_or_none, + str_to_int, unified_strdate, ) class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' + _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [ { 'url': 'http://motherless.com/AC3FFE1', - 'md5': '5527fef81d2e529215dad3c2d744a7d9', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', 'info_dict': { 'id': 'AC3FFE1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fucked in the ass while playing PS3', 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', @@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg', 'age_limit': 18, } + }, + { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } } ] - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + def _real_extract(self, url): + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - - video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') + title = self._html_search_regex( + r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + video_url = self._html_search_regex( + r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') age_limit = self._rta_search(webpage) - - view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') + view_count = str_to_int(self._html_search_regex( + r'<strong>Views</strong>\s+([^<]+)<', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._html_search_regex( + r'<strong>Favorited</strong>\s+([^<]+)<', + webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') + upload_date = self._html_search_regex( + r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) - like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') - comment_count = webpage.count('class="media-comment-contents"') - uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') + uploader_id = self._html_search_regex( + r'"thumb-member-username">\s+<a href="/m/([^"]+)"', + webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage) if categories: @@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor): 'uploader_id': uploader_id, 'thumbnail': self._og_search_thumbnail(webpage), 'categories': categories, - 'view_count': int_or_none(view_count.replace(',', '')), - 'like_count': int_or_none(like_count.replace(',', '')), + 'view_count': view_count, + 'like_count': like_count, 'comment_count': comment_count, 'age_limit': age_limit, 'url': video_url, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e75ab7c39..7b5449031 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -26,8 +26,7 @@ class NBCIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url') if theplatform_url.startswith('//'): @@ -57,7 +56,7 @@ class NBCNewsIE(InfoExtractor): 'md5': 'b2421750c9f260783721d898f4c42063', 'info_dict': { 'id': 'I1wpAI_zmhsQ', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, @@ -97,6 +96,8 @@ class NBCNewsIE(InfoExtractor): ] for base_url in base_urls: + if not base_url: + continue playlist_url = base_url + '?form=MPXNBCNewsAPI' all_videos = self._download_json(playlist_url, title)['videos'] diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 94d5ba982..add4b3e5d 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -18,16 +18,16 @@ class NDRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html', - 'md5': '4a4eeafd17c3058b65f0c8f091355855', + 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', + 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', 'note': 'Video file', 'info_dict': { - 'id': '325', + 'id': '25866', 'ext': 'mp4', - 'title': 'Blaue Bohnen aus Blocken', - 'description': 'md5:190d71ba2ccddc805ed01547718963bc', - 'duration': 1715, - }, + 'title': 'Kartoffeltage in der Lewitz', + 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', + 'duration': 166, + } }, { 'url': 'http://www.ndr.de/info/audio51535.html', diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 072d9cf8e..82af6e330 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( compat_urlparse, compat_urllib_parse, - determine_ext, unified_strdate, ) @@ -22,21 +21,23 @@ class NHLBaseInfoExtractor(InfoExtractor): self.report_extraction(video_id) initial_video_url = info['publishPoint'] - data = compat_urllib_parse.urlencode({ - 'type': 'fvod', - 'path': initial_video_url.replace('.mp4', '_sd.mp4'), - }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text + if info['formats'] == '1': + data = compat_urllib_parse.urlencode({ + 'type': 'fvod', + 'path': initial_video_url.replace('.mp4', '_sd.mp4'), + }) + path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data + path_doc = self._download_xml( + path_url, video_id, 'Downloading final video url') + video_url = path_doc.find('path').text + else: + video_url = initial_video_url join = compat_urlparse.urljoin return { 'id': video_id, 'title': info['name'], 'url': video_url, - 'ext': determine_ext(video_url), 'description': info['description'], 'duration': int(info['duration']), 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), @@ -46,10 +47,11 @@ class NHLBaseInfoExtractor(InfoExtractor): class NHLIE(NHLBaseInfoExtractor): IE_NAME = 'nhl.com' - _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9a-z-]+)' _TESTS = [{ 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', + 'md5': 'db704a4ea09e8d3988c85e36cc892d09', 'info_dict': { 'id': '453614', 'ext': 'mp4', @@ -59,6 +61,17 @@ class NHLIE(NHLBaseInfoExtractor): 'upload_date': '20131006', }, }, { + 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', + 'md5': 'd22e82bc592f52d37d24b03531ee9696', + 'info_dict': { + 'id': '2014020024-628-h', + 'ext': 'mp4', + 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', + 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', + 'duration': 0, + 'upload_date': '20141011', + }, + }, { 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', 'only_matching': True, }] diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index c0c139b5d..7b85589b7 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -39,18 +39,17 @@ class NiconicoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - # Determine whether the downloader uses authentication to download video - _AUTHENTICATE = False + # Determine whether the downloader used authentication to download video + _AUTHENTICATED = False def _real_initialize(self): - if self._downloader.params.get('username', None) is not None: - self._AUTHENTICATE = True - - if self._AUTHENTICATE: - self._login() + self._login() def _login(self): (username, password) = self._get_login_info() + # No authentication to be performed + if not username: + return True # Log in login_form_strs = { @@ -68,6 +67,8 @@ class NiconicoIE(InfoExtractor): if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False + # Successful login + self._AUTHENTICATED = True return True def _real_extract(self, url): @@ -82,7 +83,7 @@ class NiconicoIE(InfoExtractor): 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') - if self._AUTHENTICATE: + if self._AUTHENTICATED: # Get flv info flv_info_webpage = self._download_webpage( 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8f140d626..6118ed5c2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -80,8 +80,14 @@ class PBSIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140122', } + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', + 'info_dict': { + 'id': 'united-states-of-secrets', + }, + 'playlist_count': 2, } - ] def _extract_webpage(self, url): @@ -96,6 +102,12 @@ class PBSIE(InfoExtractor): r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', webpage, 'upload date', default=None)) + # tabbed frontline videos + tabbed_videos = re.findall( + r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage) + if tabbed_videos: + return tabbed_videos, presumptive_id, upload_date + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer @@ -130,6 +142,12 @@ class PBSIE(InfoExtractor): def _real_extract(self, url): video_id, display_id, upload_date = self._extract_webpage(url) + if isinstance(video_id, list): + entries = [self.url_result( + 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) + for vid_id in video_id] + return self.playlist_result(entries, display_id) + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 4118ee956..618e8f5dd 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -16,13 +16,14 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))' + _VALID_URL = r'^https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)' _TEST = { 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', - 'file': '648719015.mp4', 'md5': '882f488fa1f0026f023f33576004a2ed', 'info_dict': { - "uploader": "BABES-COM", + 'id': '648719015', + 'ext': 'mp4', + "uploader": "Babes", "title": "Seductive Indian beauty strips down and fingers her pink pussy", "age_limit": 18 } @@ -35,9 +36,7 @@ class PornHubIE(InfoExtractor): return count def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') + video_id = self._match_id(url) req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') @@ -45,7 +44,7 @@ class PornHubIE(InfoExtractor): video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False) if thumbnail: diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index 463e85501..7fcde086c 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -14,7 +14,6 @@ from ..utils import ( class PromptFileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)' - _FILE_NOT_FOUND_REGEX = r'<div.+id="not_found_msg".+>.+</div>[^-]' _TEST = { 'url': 'http://www.promptfile.com/l/D21B4746E9-F01462F0FF', 'md5': 'd1451b6302da7215485837aaea882c4c', @@ -27,11 +26,10 @@ class PromptFileIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: + if re.search(r'<div.+id="not_found_msg".+>(?!We are).+</div>[^-]', webpage) is not None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index a45884b25..1a41cbe40 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -81,7 +81,7 @@ class RTLnowIE(InfoExtractor): 'id': '99205', 'ext': 'flv', 'title': 'Medicopter 117 - Angst!', - 'description': 'md5:895b1df01639b5f61a04fc305a5cb94d', + 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin', 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', 'upload_date': '20080928', 'duration': 2691, diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 55b58e5e6..0e470e73f 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -21,19 +19,20 @@ class RUHDIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'<param name="src" value="([^"]+)"', webpage, 'video url') title = self._html_search_regex( - r'<title>([^<]+) RUHD.ru - Видео Высокого качества №1 в России!</title>', webpage, 'title') + r'<title>([^<]+) RUHD.ru - Видео Высокого качества №1 в России!</title>', + webpage, 'title') description = self._html_search_regex( - r'(?s)<div id="longdesc">(.+?)<span id="showlink">', webpage, 'description', fatal=False) + r'(?s)<div id="longdesc">(.+?)<span id="showlink">', + webpage, 'description', fatal=False) thumbnail = self._html_search_regex( - r'<param name="previewImage" value="([^"]+)"', webpage, 'thumbnail', fatal=False) + r'<param name="previewImage" value="([^"]+)"', + webpage, 'thumbnail', fatal=False) if thumbnail: thumbnail = 'http://www.ruhd.ru' + thumbnail diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py new file mode 100644 index 000000000..c833fc8ee --- /dev/null +++ b/youtube_dl/extractor/sexykarma.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + parse_duration, + int_or_none, +) + + +class SexyKarmaIE(InfoExtractor): + IE_DESC = 'Sexy Karma and Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', + 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', + 'info_dict': { + 'id': 'yHI70cOyIHt', + 'display_id': 'taking-a-quick-pee', + 'ext': 'mp4', + 'title': 'Taking a quick pee.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'wildginger7', + 'upload_date': '20141007', + 'duration': 22, + 'view_count': int, + 'comment_count': int, + 'categories': list, + } + }, { + 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', + 'md5': 'dd216c68d29b49b12842b9babe762a5d', + 'info_dict': { + 'id': '8Id6EZPbuHf', + 'display_id': 'pot-pixie-tribute', + 'ext': 'mp4', + 'title': 'pot_pixie tribute', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'banffite', + 'upload_date': '20141013', + 'duration': 16, + 'view_count': int, + 'comment_count': int, + 'categories': list, + } + }, { + 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', + 'md5': '9afb80675550406ed9a63ac2819ef69d', + 'info_dict': { + 'id': 'dW2mtctxJfs', + 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', + 'ext': 'mp4', + 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Don', + 'upload_date': '20140213', + 'duration': 83, + 'view_count': int, + 'comment_count': int, + 'categories': list, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._html_search_regex( + r"url: escape\('([^']+)'\)", webpage, 'url') + + title = self._html_search_regex( + r'<h2 class="he2"><span>(.*?)</span>', + webpage, 'title') + thumbnail = self._html_search_regex( + r'<span id="container"><img\s+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'class="aupa">\s*(.*?)</a>', + webpage, 'uploader') + upload_date = unified_strdate(self._html_search_regex( + r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + + duration = parse_duration(self._search_regex( + r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'comment count', fatal=False)) + + categories = re.findall( + r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', + webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 4719ba45c..54256e1a2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -40,14 +40,15 @@ class SoundcloudIE(InfoExtractor): _TESTS = [ { 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - 'file': '62986583.mp3', 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', 'info_dict': { - "upload_date": "20121011", - "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", - "uploader": "E.T. ExTerrestrial Music", - "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1", - "duration": 143, + 'id': '62986583', + 'ext': 'mp3', + 'upload_date': '20121011', + 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', + 'uploader': 'E.T. ExTerrestrial Music', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'duration': 143, } }, # not streamable song @@ -103,7 +104,7 @@ class SoundcloudIE(InfoExtractor): 'id': '128590877', 'ext': 'mp3', 'title': 'Bus Brakes', - 'description': 'md5:0170be75dd395c96025d210d261c784e', + 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', 'upload_date': '20140109', 'duration': 17, @@ -140,6 +141,7 @@ class SoundcloudIE(InfoExtractor): 'description': info['description'], 'thumbnail': thumbnail, 'duration': int_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), } formats = [] if info.get('downloadable', False): diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 19cc976e3..becdf658f 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, - int_or_none, ) @@ -26,7 +25,6 @@ class SportBoxIE(InfoExtractor): 'timestamp': 1411896237, 'upload_date': '20140928', 'duration': 4846, - 'view_count': int, }, 'params': { # m3u8 download @@ -65,8 +63,6 @@ class SportBoxIE(InfoExtractor): r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) duration = parse_duration(self._html_search_regex( r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False)) return { 'id': video_id, @@ -76,6 +72,5 @@ class SportBoxIE(InfoExtractor): 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, - 'view_count': view_count, 'formats': formats, } diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py new file mode 100644 index 000000000..d92d14d65 --- /dev/null +++ b/youtube_dl/extractor/srmediathek.py @@ -0,0 +1,43 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import js_to_json + + +class SRMediathekIE(InfoExtractor): + IE_DESC = 'Süddeutscher Rundfunk' + _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', + 'info_dict': { + 'id': '28455', + 'ext': 'mp4', + 'title': 'sportarena (26.10.2014)', + 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + urls = json.loads(js_to_json(self._search_regex( + r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs'))) + formats = [{'url': url} for url in urls] + self._sort_formats(formats) + + title = json.loads(js_to_json(self._search_regex( + r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index f76b6e2b2..5ca079f88 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -10,7 +10,6 @@ class SyfyIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', - 'md5': 'e07de1d52c7278adbb9b9b1c93a66849', 'info_dict': { 'id': 'NmqMrGnXvmO1', 'ext': 'flv', diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d5e28efad..cd4af96fd 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -65,6 +65,22 @@ class TEDIE(SubtitlesInfoExtractor): 'title': 'Who are the hackers?', }, 'playlist_mincount': 6, + }, { + # contains a youtube video + 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_ZG8HBuDjgc', + 'ext': 'mp4', + 'title': 'Douglas Adams: Parrots the Universe and Everything', + 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', + 'uploader': 'University of California Television (UCTV)', + 'uploader_id': 'UCtelevision', + 'upload_date': '20080522', + }, + 'params': { + 'skip_download': True, + }, }] _NATIVE_FORMATS = { @@ -114,6 +130,13 @@ class TEDIE(SubtitlesInfoExtractor): talk_info = self._extract_info(webpage)['talks'][0] + if talk_info.get('external') is not None: + self.to_screen('Found video from %s' % talk_info['external']['service']) + return { + '_type': 'url', + 'url': talk_info['external']['uri'], + } + formats = [{ 'url': format_url, 'format_id': format_id, diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py new file mode 100644 index 000000000..db9788c18 --- /dev/null +++ b/youtube_dl/extractor/telecinco.py @@ -0,0 +1,19 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .mitele import MiTeleIE + + +class TelecincoIE(MiTeleIE): + IE_NAME = 'telecinco.es' + _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html' + + _TEST = { + 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', + 'info_dict': { + 'id': 'MDSVID20141015_0058', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al ...', + 'duration': 662, + }, + } diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py new file mode 100644 index 000000000..b65d8e03f --- /dev/null +++ b/youtube_dl/extractor/theonion.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class TheOnionIE(InfoExtractor): + _VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?' + _TEST = { + 'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/', + 'md5': '19eaa9a39cf9b9804d982e654dc791ee', + 'info_dict': { + 'id': '2133', + 'ext': 'mp4', + 'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image', + 'description': 'md5:cc12448686b5600baae9261d3e180910', + 'thumbnail': 're:^https?://.*\.jpg\?\d+$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + article_id = mobj.group('article_id') + + webpage = self._download_webpage(url, article_id) + + video_id = self._search_regex( + r'"videoId":\s(\d+),', webpage, 'video ID') + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage) + if not sources: + raise ExtractorError( + 'No sources found for video %s' % video_id, expected=True) + + formats = [] + for src, type_ in sources: + if type_ == 'video/mp4': + formats.append({ + 'format_id': 'mp4_sd', + 'preference': 1, + 'url': src, + }) + elif type_ == 'video/webm': + formats.append({ + 'format_id': 'webm_sd', + 'preference': 0, + 'url': src, + }) + elif type_ == 'application/x-mpegURL': + formats.extend( + self._extract_m3u8_formats(src, video_id, preference=-1)) + else: + self.report_warning( + 'Encountered unexpected format: %s' % type_) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 0be793b1c..a04925633 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -6,6 +6,7 @@ import json from .common import InfoExtractor from ..utils import ( compat_str, + determine_ext, ExtractorError, xpath_with_ns, ) @@ -34,10 +35,21 @@ class ThePlatformIE(InfoExtractor): 'skip_download': True, }, } + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + if mobj.group('config'): + config_url = url+ '&form=json' + config_url = config_url.replace('swf/', 'config/') + config_url = config_url.replace('onsite/', 'onsite/config/') + config = self._download_json(config_url, video_id, 'Downloading config') + smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' + else: + smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' + 'format=smil&mbr=true'.format(video_id)) - def _get_info(self, video_id, smil_url): - meta = self._download_xml(smil_url, video_id) + meta = self._download_xml(smil_url, video_id) try: error_msg = next( n.attrib['abstract'] @@ -89,10 +101,14 @@ class ThePlatformIE(InfoExtractor): for f in switch.findall(_x('smil:video')): attr = f.attrib vbr = int(attr['system-bitrate']) // 1000 + ext = determine_ext(attr['src']) + if ext == 'once': + ext = 'mp4' formats.append({ 'format_id': compat_str(vbr), 'url': attr['src'], 'vbr': vbr, + 'ext': ext, }) self._sort_formats(formats) @@ -104,17 +120,3 @@ class ThePlatformIE(InfoExtractor): 'thumbnail': info['defaultThumbnailUrl'], 'duration': info['duration']//1000, } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if mobj.group('config'): - config_url = url+ '&form=json' - config_url = config_url.replace('swf/', 'config/') - config_url = config_url.replace('onsite/', 'onsite/config/') - config = self._download_json(config_url, video_id, 'Downloading config') - smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' - else: - smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' - 'format=smil&mbr=true'.format(video_id)) - return self._get_info(video_id, smil_url) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 306fe8974..40c53ff17 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,9 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TumblrIE(InfoExtractor): @@ -18,7 +15,7 @@ class TumblrIE(InfoExtractor): 'id': '54196191430', 'ext': 'mp4', 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', - 'description': 'md5:dfac39636969fe6bf1caa2d50405f069', + 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', 'thumbnail': 're:http://.*\.jpg', } }, { @@ -27,7 +24,7 @@ class TumblrIE(InfoExtractor): 'info_dict': { 'id': '90208453769', 'ext': 'mp4', - 'title': '5SOS STRUM ;)', + 'title': '5SOS STRUM ;]', 'description': 'md5:dba62ac8639482759c8eb10ce474586a', 'thumbnail': 're:http://.*\.jpg', } @@ -41,18 +38,12 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage = self._download_webpage(url, video_id) - re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) - video = re.search(re_video, webpage) - if video is None: - raise ExtractorError('Unable to extract video') - video_url = video.group('video_url') - ext = video.group('ext') - - video_thumbnail = self._search_regex( - r'posters.*?\[\\x22(.*?)\\x22', - webpage, 'thumbnail', fatal=False) # We pick the first poster - if video_thumbnail: - video_thumbnail = video_thumbnail.replace('\\\\/', '/') + iframe_url = self._search_regex( + r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + webpage, 'iframe url') + iframe = self._download_webpage(iframe_url, video_id) + video_url = self._search_regex(r'<source src="([^"]+)"', + iframe, 'video url') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos @@ -62,9 +53,9 @@ class TumblrIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'title': video_title, - 'description': self._html_search_meta('description', webpage), - 'thumbnail': video_thumbnail, - 'ext': ext, + 'url': video_url, + 'ext': 'mp4', + 'title': video_title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py new file mode 100644 index 000000000..36aa1ad6e --- /dev/null +++ b/youtube_dl/extractor/twitch.py @@ -0,0 +1,187 @@ +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class TwitchIE(InfoExtractor): + # TODO: One broadcast may be split into multiple videos. The key + # 'broadcast_id' is the same for all parts, and 'broadcast_part' + # starts at 1 and increases. Can we treat all parts as one video? + _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/ + (?: + (?P<channelid>[^/]+)| + (?:(?:[^/]+)/b/(?P<videoid>[^/]+))| + (?:(?:[^/]+)/c/(?P<chapterid>[^/]+)) + ) + /?(?:\#.*)?$ + """ + _PAGE_LIMIT = 100 + _API_BASE = 'https://api.twitch.tv' + _TESTS = [{ + 'url': 'http://www.twitch.tv/riotgames/b/577357806', + 'info_dict': { + 'id': 'a577357806', + 'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.twitch.tv/acracingleague/c/5285812', + 'info_dict': { + 'id': 'c5285812', + 'title': 'ACRL Off Season - Sports Cars @ Nordschleife', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://www.twitch.tv/vanillatv', + 'info_dict': { + 'id': 'vanillatv', + 'title': 'VanillaTV', + }, + 'playlist_mincount': 412, + }] + + def _handle_error(self, response): + if not isinstance(response, dict): + return + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), + expected=True) + + def _download_json(self, url, video_id, note='Downloading JSON metadata'): + response = super(TwitchIE, self)._download_json(url, video_id, note) + self._handle_error(response) + return response + + def _extract_media(self, item, item_id): + ITEMS = { + 'a': 'video', + 'c': 'chapter', + } + info = self._extract_info(self._download_json( + '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + 'Downloading %s info JSON' % ITEMS[item])) + response = self._download_json( + '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + 'Downloading %s playlist JSON' % ITEMS[item]) + entries = [] + chunks = response['chunks'] + qualities = list(chunks.keys()) + for num, fragment in enumerate(zip(*chunks.values()), start=1): + formats = [] + for fmt_num, fragment_fmt in enumerate(fragment): + format_id = qualities[fmt_num] + fmt = { + 'url': fragment_fmt['url'], + 'format_id': format_id, + 'quality': 1 if format_id == 'live' else 0, + } + m = re.search(r'^(?P<height>\d+)[Pp]', format_id) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) + entry = dict(info) + entry['id'] = '%s_%d' % (entry['id'], num) + entry['title'] = '%s part %d' % (entry['title'], num) + entry['formats'] = formats + entries.append(entry) + return self.playlist_result(entries, info['id'], info['title']) + + def _extract_info(self, info): + return { + 'id': info['_id'], + 'title': info['title'], + 'description': info['description'], + 'duration': info['length'], + 'thumbnail': info['preview'], + 'uploader': info['channel']['display_name'], + 'uploader_id': info['channel']['name'], + 'timestamp': parse_iso8601(info['recorded_at']), + 'view_count': info['views'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj.group('chapterid'): + return self._extract_media('c', mobj.group('chapterid')) + + """ + webpage = self._download_webpage(url, chapter_id) + m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) + if not m: + raise ExtractorError('Cannot find archive of a chapter') + archive_id = m.group(1) + + api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id + doc = self._download_xml( + api, chapter_id, + note='Downloading chapter information', + errnote='Chapter information download failed') + for a in doc.findall('.//archive'): + if archive_id == a.find('./id').text: + break + else: + raise ExtractorError('Could not find chapter in chapter information') + + video_url = a.find('./video_file_url').text + video_ext = video_url.rpartition('.')[2] or 'flv' + + chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id + chapter_info = self._download_json( + chapter_api_url, 'c' + chapter_id, + note='Downloading chapter metadata', + errnote='Download of chapter metadata failed') + + bracket_start = int(doc.find('.//bracket_start').text) + bracket_end = int(doc.find('.//bracket_end').text) + + # TODO determine start (and probably fix up file) + # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 + #video_url += '?start=' + TODO:start_timestamp + # bracket_start is 13290, but we want 51670615 + self._downloader.report_warning('Chapter detected, but we can just download the whole file. ' + 'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) + + info = { + 'id': 'c' + chapter_id, + 'url': video_url, + 'ext': video_ext, + 'title': chapter_info['title'], + 'thumbnail': chapter_info['preview'], + 'description': chapter_info['description'], + 'uploader': chapter_info['channel']['display_name'], + 'uploader_id': chapter_info['channel']['name'], + } + return info + """ + elif mobj.group('videoid'): + return self._extract_media('a', mobj.group('videoid')) + elif mobj.group('channelid'): + channel_id = mobj.group('channelid') + info = self._download_json( + '%s/kraken/channels/%s' % (self._API_BASE, channel_id), + channel_id, 'Downloading channel info JSON') + channel_name = info.get('display_name') or info.get('name') + entries = [] + offset = 0 + limit = self._PAGE_LIMIT + for counter in itertools.count(1): + response = self._download_json( + '%s/kraken/channels/%s/videos/?offset=%d&limit=%d' + % (self._API_BASE, channel_id, offset, limit), + channel_id, 'Downloading channel videos JSON page %d' % counter) + videos = response['videos'] + if not videos: + break + entries.extend([self.url_result(video['url'], 'Twitch') for video in videos]) + offset += limit + return self.playlist_result(entries, channel_id, channel_name) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 994b60a76..cee1ea8f6 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -72,7 +72,7 @@ class UstreamChannelIE(InfoExtractor): 'info_dict': { 'id': '10874166', }, - 'playlist_mincount': 54, + 'playlist_mincount': 17, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 964470070..d3fa70e0e 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -17,7 +17,7 @@ class VGTVIE(InfoExtractor): 'info_dict': { 'id': '84196', 'ext': 'mp4', - 'title': 'Hevnen er søt episode 10: Abu', + 'title': 'Hevnen er søt episode 1:10 - Abu', 'description': 'md5:e25e4badb5f544b04341e14abdc72234', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 648.000, @@ -67,9 +67,7 @@ class VGTVIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) data = self._download_json( 'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id, video_id, 'Downloading media JSON') diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 9328ef4a2..0faa729c6 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -1,55 +1,85 @@ -import json -import re +from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { - u"url": u"http://www.viddler.com/v/43903784", - u'file': u'43903784.mp4', - u'md5': u'fbbaedf7813e514eb7ca30410f439ac9', - u'info_dict': { - u"title": u"Video Made Easy", - u"uploader": u"viddler", - u"duration": 100.89, + "url": "http://www.viddler.com/v/43903784", + 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4', + 'info_dict': { + 'id': '43903784', + 'ext': 'mp4', + "title": "Video Made Easy", + 'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ', + "uploader": "viddler", + 'timestamp': 1335371429, + 'upload_date': '20120425', + "duration": 100.89, + 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - embed_url = mobj.group('domain') + u'/embed/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - video_sources_code = self._search_regex( - r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs') - video_sources = json.loads(video_sources_code.replace("'", '"')) - - formats = [{ - 'url': video_url, - 'format': format_id, - } for video_url, format_id in video_sources.items()] - - title = self._html_search_regex( - r"title\s*:\s*'([^']*)'", webpage, u'title') - uploader = self._html_search_regex( - r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False) - duration_s = self._html_search_regex( - r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False) - duration = float(duration_s) if duration_s else None - thumbnail = self._html_search_regex( - r"thumbnail\s*:\s*'([^']*)'", - webpage, u'thumbnail', fatal=False) + video_id = self._match_id(url) + + json_url = ( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' % + video_id) + data = self._download_json(json_url, video_id)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + f = { + 'format_id': filed['profile_id'], + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url']) + f['format_id'] = filed['profile_id'] + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url( + filed['html5_video_source']) + f['format_id'] = filed['profile_id'] + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] return { '_type': 'video', 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, + 'title': data['title'], 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'categories': categories, } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py new file mode 100644 index 000000000..669979e13 --- /dev/null +++ b/youtube_dl/extractor/vidzi.py @@ -0,0 +1,33 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class VidziIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' + _TEST = { + 'url': 'http://vidzi.tv/cghql9yq6emu.html', + 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', + 'info_dict': { + 'id': 'cghql9yq6emu', + 'ext': 'mp4', + 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + video_url = self._html_search_regex( + r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + title = self._html_search_regex( + r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } +
\ No newline at end of file diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d2c36b58a..d9cad0ea5 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,13 +8,11 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( - clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, compat_urlparse, ExtractorError, - get_element_by_attribute, InAdvancePagedList, int_or_none, RegexNotFoundError, @@ -56,7 +54,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - (?P<proto>(?:https?:)?//)? + https?:// (?:(?:www|(?P<player>player))\.)? vimeo(?P<pro>pro)?\.com/ (?!channels/[^/?#]+/?(?:$|[?#])|album/) @@ -157,6 +155,18 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'duration': 62, } }, + { + # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ + 'url': 'https://player.vimeo.com/video/98044508', + 'note': 'The js code contains assignments to the same variable as the config', + 'info_dict': { + 'id': '98044508', + 'ext': 'mp4', + 'title': 'Pier Solar OUYA Official Trailer', + 'uploader': 'Tulio Gonçalves', + 'uploader_id': 'user28849593', + }, + }, ] def _verify_video_password(self, url, video_id, webpage): @@ -244,7 +254,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # We try to find out to which variable is assigned the config dic m_variable_name = re.search('(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) + config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] config = self._search_regex(config_re, webpage, 'info section', @@ -502,7 +512,7 @@ class VimeoReviewIE(InfoExtractor): 'info_dict': { 'id': '91613211', 'ext': 'mp4', - 'title': 'Death by dogma versus assembling agile - Sander Hoogendoorn', + 'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn', 'uploader': 'DevWeek Events', 'duration': 2773, 'thumbnail': 're:^https?://.*\.jpg$', diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index e7754158d..42995226e 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -70,7 +70,7 @@ class VineUserIE(InfoExtractor): 'info_dict': { 'id': 'Visa', }, - 'playlist_mincount': 47, + 'playlist_mincount': 46, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py new file mode 100644 index 000000000..57ef8dc30 --- /dev/null +++ b/youtube_dl/extractor/vrt.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import float_or_none + + +class VRTIE(InfoExtractor): + _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' + _TESTS = [ + # deredactie.be + { + 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL', + 'md5': '4cebde1eb60a53782d4f3992cbd46ec8', + 'info_dict': { + 'id': '2129880', + 'ext': 'flv', + 'title': 'Het journaal L - 25/10/14', + 'description': None, + 'timestamp': 1414271750.949, + 'upload_date': '20141025', + 'duration': 929, + } + }, + # sporza.be + { + 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time', + 'md5': '11f53088da9bf8e7cfc42456697953ff', + 'info_dict': { + 'id': '2124639', + 'ext': 'flv', + 'title': 'Bekijk Extra Time van 20 oktober', + 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426', + 'timestamp': 1413835980.560, + 'upload_date': '20141020', + 'duration': 3238, + } + }, + # cobra.be + { + 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari', + 'md5': '78a2b060a5083c4f055449a72477409d', + 'info_dict': { + 'id': '2126050', + 'ext': 'flv', + 'title': 'Bret Easton Ellis in Café Corsari', + 'description': 'md5:f699986e823f32fd6036c1855a724ee9', + 'timestamp': 1413967500.494, + 'upload_date': '20141022', + 'duration': 661, + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) + + formats = [] + mobj = re.search( + r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"', + webpage) + if mobj: + formats.extend(self._extract_m3u8_formats( + '%s/%s' % (mobj.group('server'), mobj.group('path')), + video_id, 'mp4')) + mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage) + if mobj: + formats.extend(self._extract_f4m_formats( + '%s/manifest.f4m' % mobj.group('src'), video_id)) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = float_or_none(self._search_regex( + r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) + duration = float_or_none(self._search_regex( + r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 273d93d9e..c3bb9b2cf 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -20,7 +20,7 @@ class XTubeIE(InfoExtractor): 'id': 'kVTUy_G222_', 'ext': 'mp4', 'title': 'strange erotica', - 'description': 'surreal gay themed erotica...almost an ET kind of thing', + 'description': 'http://www.xtube.com an ET kind of thing', 'uploader': 'greenshowers', 'duration': 450, 'age_limit': 18, diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 944d7da38..9cd7989cc 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,6 @@ class YnetIE(InfoExtractor): _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'md5': '4b29cb57c3dddd57642b3f051f535b07', 'info_dict': { 'id': 'L-11659-99244', 'ext': 'flv', @@ -22,7 +21,6 @@ class YnetIE(InfoExtractor): } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', - 'md5': '8194c2ea221e9a639cac96b6b0753dc5', 'info_dict': { 'id': 'L-8859-84418', 'ext': 'flv', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9041cfa87..40fe4662a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -185,14 +185,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._download_webpage( req, None, - note='Confirming age', errnote='Unable to confirm age') - return True + note='Confirming age', errnote='Unable to confirm age', + fatal=False) def _real_initialize(self): if self._downloader is None: return - if not self._set_language(): - return + if self._get_login_info()[0] is not None: + if not self._set_language(): + return if not self._login(): return self._confirm_age() @@ -286,6 +287,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -938,7 +940,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest - if (self._downloader.params.get('youtube_include_dash_manifest', False)): + if self._downloader.params.get('youtube_include_dash_manifest', True): try: # The DASH manifest used needs to be the one from the original video_webpage. # The one found in get_video_info seems to be using different signatures. @@ -1055,7 +1057,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'note': 'issue #673', 'url': 'PLBB231211A4F62143', 'info_dict': { - 'title': 'Team Fortress 2 (Class-based LP)', + 'title': '[OLD]Team Fortress 2 (Class-based LP)', }, 'playlist_mincount': 26, }, { |