diff options
Diffstat (limited to 'youtube_dl/extractor')
36 files changed, 1289 insertions, 326 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9ee3f9190..6ab3eeaf5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -135,12 +135,14 @@ from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE from .godtube import GodTubeIE +from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE from .hark import HarkIE +from .heise import HeiseIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hornbunny import HornBunnyIE @@ -199,6 +201,7 @@ from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE +from .mgoon import MgoonIE from .ministrygrid import MinistryGridIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE @@ -239,6 +242,7 @@ from .ndtv import NDTVIE from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nfb import NFBIE +from .nfl import NFLIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE @@ -248,7 +252,10 @@ from .nosvideo import NosVideoIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE -from .npo import NPOIE +from .npo import ( + NPOIE, + TegenlichtVproIE, +) from .nrk import ( NRKIE, NRKTVIE, @@ -256,6 +263,7 @@ from .nrk import ( from .ntv import NTVIE from .nytimes import NYTimesIE from .nuvid import NuvidIE +from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .orf import ( ORFTVthekIE, @@ -335,6 +343,7 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .sport5 import Sport5IE from .sportdeutschland import SportDeutschlandIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE @@ -362,6 +371,7 @@ from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE +from .thvideo import THVideoIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE @@ -445,9 +455,11 @@ from .yahoo import ( YahooNewsIE, YahooSearchIE, ) +from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE +from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, YoutubeChannelIE, diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 7d89f44ee..69f89320c 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -22,8 +22,7 @@ class ABCIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) urls_info_json = self._search_regex( diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py index bc64423a3..ad86d6e58 100644 --- a/youtube_dl/extractor/anysex.py +++ b/youtube_dl/extractor/anysex.py @@ -35,7 +35,7 @@ class AnySexIE(InfoExtractor): title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') description = self._html_search_regex( - r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False) + r'<div class="description"[^>]*>([^<]+)</div>', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) @@ -43,7 +43,7 @@ class AnySexIE(InfoExtractor): r'<a href="http://anysex\.com/categories/[^"]+" title="[^"]*">([^<]+)</a>', webpage) duration = parse_duration(self._search_regex( - r'<b>Duration:</b> (\d+:\d+)', webpage, 'duration', fatal=False)) + r'<b>Duration:</b> (?:<q itemprop="duration">)?(\d+:\d+)', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'<b>Views:</b> (\d+)', webpage, 'view count', fatal=False)) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 54cec1c2f..8de9c11ea 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,8 +8,6 @@ from ..utils import ( determine_ext, ExtractorError, qualities, - compat_urllib_parse_urlparse, - compat_urllib_parse, int_or_none, parse_duration, unified_strdate, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c30a1d33..f43a0a569 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import base64 +import datetime import hashlib import json import netrc @@ -15,11 +16,13 @@ from ..utils import ( compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urlparse, compat_str, clean_html, compiled_regex_type, ExtractorError, + float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, @@ -164,6 +167,14 @@ class InfoExtractor(object): return cls._VALID_URL_RE.match(url) is not None @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + + @classmethod def working(cls): """Getter method for _WORKING.""" return cls._WORKING @@ -640,7 +651,9 @@ class InfoExtractor(object): return formats - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None): + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + entry_protocol='m3u8', preference=None): + formats = [{ 'format_id': 'm3u8-meta', 'url': m3u8_url, @@ -651,6 +664,11 @@ class InfoExtractor(object): 'format_note': 'Quality selection URL', }] + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + m3u8_doc = self._download_webpage(m3u8_url, video_id) last_info = None kv_rex = re.compile( @@ -667,15 +685,17 @@ class InfoExtractor(object): continue else: if last_info is None: - formats.append({'url': line}) + formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) f = { 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), - 'url': line.strip(), + 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, } codecs = last_info.get('CODECS') if codecs: @@ -695,6 +715,34 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + def _live_title(self, name): + """ Generate the title for a live video """ + now = datetime.datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M") + return name + ' ' + now_str + + def _int(self, v, name, fatal=False, **kwargs): + res = int_or_none(v, **kwargs) + if 'get_attr' in kwargs: + print(getattr(v, kwargs['get_attr'])) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + + def _float(self, v, name, fatal=False, **kwargs): + res = float_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4903764f7..f99888ecc 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..utils import ( ExtractorError, compat_urllib_parse, @@ -26,7 +26,7 @@ from ..aes import ( ) -class CrunchyrollIE(InfoExtractor): +class CrunchyrollIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TEST = { 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -271,6 +271,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + return { 'id': video_id, 'title': video_title, diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py index 4ca3f37a2..b88379e06 100644 --- a/youtube_dl/extractor/divxstage.py +++ b/youtube_dl/extractor/divxstage.py @@ -7,7 +7,7 @@ class DivxStageIE(NovaMovIE): IE_NAME = 'divxstage' IE_DESC = 'DivxStage' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag)'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'} _HOST = 'www.divxstage.eu' @@ -24,4 +24,4 @@ class DivxStageIE(NovaMovIE): 'title': 'youtubedl test video', 'description': 'This is a test video for youtubedl.', } - }
\ No newline at end of file + } diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 1e1763abf..817a9bd61 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,24 +5,29 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote +from ..utils import compat_urllib_parse_unquote, url_basename class DropboxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' + _TESTS = [{ 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', 'info_dict': { 'id': 'nelirfsxnmcfbfh', 'ext': 'mp4', 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } - } + }, + { + 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - fn = compat_urllib_parse_unquote(mobj.group('title')) + fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] video_url = ( re.sub(r'[?&]dl=0', '', url) + diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 4ba323148..2cba82532 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,20 +9,20 @@ from ..utils import ExtractorError class EitbIE(InfoExtractor): - IE_NAME = u'eitb.tv' + IE_NAME = 'eitb.tv' _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' _TEST = { - u'add_ie': ['Brightcove'], - u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', - u'md5': u'edf4436247185adee3ea18ce64c47998', - u'info_dict': { - u'id': u'2743577154001', - u'ext': u'mp4', - u'title': u'60 minutos (Lasa y Zabala, 30 años)', + 'add_ie': ['Brightcove'], + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'md5': 'edf4436247185adee3ea18ce64c47998', + 'info_dict': { + 'id': '2743577154001', + 'ext': 'mp4', + 'title': '60 minutos (Lasa y Zabala, 30 años)', # All videos from eitb has this description in the brightcove info - u'description': u'.', - u'uploader': u'Euskal Telebista', + 'description': '.', + 'uploader': 'Euskal Telebista', }, } @@ -30,7 +32,7 @@ class EitbIE(InfoExtractor): webpage = self._download_webpage(url, chapter_id) bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is None: - raise ExtractorError(u'Could not extract the Brightcove url') + raise ExtractorError('Could not extract the Brightcove url') # The BrightcoveExperience object doesn't contain the video id, we set # it manually bc_url += '&%40videoPlayer={0}'.format(chapter_id) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 14a196ffc..aacbf1414 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -7,6 +7,7 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, + str_to_int, ) @@ -20,6 +21,7 @@ class ExtremeTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', + 'view_count': int, 'age_limit': 18, } }, { @@ -39,8 +41,12 @@ class ExtremeTubeIE(InfoExtractor): video_title = self._html_search_regex( r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', - fatal=False) + r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', + webpage, 'uploader', fatal=False) + view_count = str_to_int(self._html_search_regex( + r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', + webpage, 'view count', fatal=False)) + video_url = compat_urllib_parse.unquote(self._html_search_regex( r'video_url=(.+?)&', webpage, 'video_url')) path = compat_urllib_parse_urlparse(video_url).path @@ -51,6 +57,7 @@ class ExtremeTubeIE(InfoExtractor): 'id': video_id, 'title': video_title, 'uploader': uploader, + 'view_count': view_count, 'url': video_url, 'format': format, 'format_id': format, diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 21ea5ec2b..e09982e88 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -10,13 +10,13 @@ from ..utils import ( class FlickrIE(InfoExtractor): - """Information Extractor for Flickr videos""" - _VALID_URL = r'(?:https?://)?(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'file': '5645318632.mp4', 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', 'info_dict': { + 'id': '5645318632', + 'ext': 'mp4', "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", "uploader_id": "forestwander-nature-pictures", "title": "Dark Hollow Waterfalls" @@ -49,12 +49,12 @@ class FlickrIE(InfoExtractor): raise ExtractorError('Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id': video_uploader_id, - }] + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40eeaad16..367f930dd 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -382,6 +382,19 @@ class GenericIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, + # Wistia embed + { + 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'info_dict': { + 'id': '1cfaf6b7ea', + 'ext': 'mov', + 'title': 'md5:51364a8d3d009997ba99656004b5e20d', + 'duration': 643.0, + 'filesize': 182808282, + 'uploader': 'education-portal.com', + }, + }, ] def report_download_webpage(self, video_id): @@ -584,7 +597,9 @@ class GenericIE(InfoExtractor): # Helper method def _playlist_from_matches(matches, getter, ie=None): - urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches) + urlrs = orderedSet( + self.url_result(self._proto_relative_url(getter(m)), ie) + for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -629,11 +644,11 @@ class GenericIE(InfoExtractor): ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v)/.+?) + (?:embed|v|p)/.+?) \1''', webpage) if matches: return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1]), ie='Youtube') + matches, lambda m: unescapeHTML(m[1])) # Look for embedded Dailymotion player matches = re.findall( @@ -654,6 +669,16 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } + match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': match.group('id') + } # Look for embedded blip.tv player mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py new file mode 100644 index 000000000..bebfe8568 --- /dev/null +++ b/youtube_dl/extractor/golem.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + determine_ext, +) + + +class GolemIE(InfoExtractor): + _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/' + _TEST = { + 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', + 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', + 'info_dict': { + 'id': '14095', + 'format_id': 'high', + 'ext': 'mp4', + 'title': 'iPhone 6 und 6 Plus - Test', + 'duration': 300.44, + 'filesize': 65309548, + } + } + + _PREFIX = 'http://video.golem.de' + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_xml( + 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) + + info = { + 'id': video_id, + 'title': config.findtext('./title', 'golem'), + 'duration': self._float(config.findtext('./playtime'), 'duration'), + } + + formats = [] + for e in config.findall('./*[url]'): + url = e.findtext('./url') + if not url: + self._downloader.report_warning( + "{0}: url: empty, skipping".format(e.tag)) + continue + + formats.append({ + 'format_id': e.tag, + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'height': self._int(e.get('height'), 'height'), + 'width': self._int(e.get('width'), 'width'), + 'filesize': self._int(e.findtext('filesize'), 'filesize'), + 'ext': determine_ext(e.findtext('./filename')), + }) + self._sort_formats(formats) + info['formats'] = formats + + thumbnails = [] + for e in config.findall('.//teaser[url]'): + url = e.findtext('./url') + if not url: + continue + thumbnails.append({ + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'width': self._int(e.get('width'), 'thumbnail width'), + 'height': self._int(e.get('height'), 'thumbnail height'), + }) + info['thumbnails'] = thumbnails + + return info diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py new file mode 100644 index 000000000..f97b1e085 --- /dev/null +++ b/youtube_dl/extractor/heise.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + get_meta_content, + parse_iso8601, +) + + +class HeiseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?heise\.de/video/artikel/ + .+?(?P<id>[0-9]+)\.html(?:$|[?#]) + ''' + _TEST = { + 'url': ( + 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' + ), + 'md5': 'ffed432483e922e88545ad9f2f15d30e', + 'info_dict': { + 'id': '2404147', + 'ext': 'mp4', + 'title': ( + "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" + ), + 'format_id': 'mp4_720', + 'timestamp': 1411812600, + 'upload_date': '20140927', + 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'json_url:\s*"([^"]+)"', webpage, 'json URL') + config = self._download_json(json_url, video_id) + + info = { + 'id': video_id, + 'thumbnail': config.get('poster'), + 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'description': self._og_search_description(webpage), + } + + title = get_meta_content('fulltitle', webpage) + if title: + info['title'] = title + elif config.get('title'): + info['title'] = config['title'] + else: + info['title'] = self._og_search_title(webpage) + + formats = [] + for t, rs in config['formats'].items(): + if not rs or not hasattr(rs, 'items'): + self._downloader.report_warning( + 'formats: {0}: no resolutions'.format(t)) + continue + + for height_str, obj in rs.items(): + format_id = '{0}_{1}'.format(t, height_str) + + if not obj or not obj.get('url'): + self._downloader.report_warning( + 'formats: {0}: no url'.format(format_id)) + continue + + formats.append({ + 'url': obj['url'], + 'format_id': format_id, + 'height': self._int(height_str, 'height'), + }) + + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py new file mode 100644 index 000000000..94bc87b00 --- /dev/null +++ b/youtube_dl/extractor/mgoon.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, +) + + +class MgoonIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| + video\.mgoon\.com)/(?P<id>[0-9]+)''' + _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' + _TESTS = [ + { + 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', + 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', + 'info_dict': { + 'id': '5582148', + 'uploader_id': 'hi6618', + 'duration': 240.419, + 'upload_date': '20131220', + 'ext': 'mp4', + 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://www.mgoon.com/play/view/5582148', + 'only_matching': True, + }, + { + 'url': 'http://video.mgoon.com/5582148', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + data = self._download_json(self._API_URL.format(video_id), video_id) + + if data.get('errorInfo', {}).get('code') != 'NONE': + raise ExtractorError('%s encountered an error: %s' % ( + self.IE_NAME, data['errorInfo']['message']), expected=True) + + v_info = data['videoInfo'] + title = v_info.get('v_title') + thumbnail = v_info.get('v_thumbnail') + duration = v_info.get('v_duration') + upload_date = unified_strdate(v_info.get('v_reg_date')) + uploader_id = data.get('userInfo', {}).get('u_alias') + if duration: + duration /= 1000.0 + + age_limit = None + if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': + age_limit = 18 + + formats = [] + get_quality = qualities(['360p', '480p', '720p', '1080p']) + for fmt in data['videoFiles']: + formats.append({ + 'format_id': fmt['label'], + 'quality': get_quality(fmt['label']), + 'url': fmt['url'], + 'ext': fmt['format'], + + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 3a938861b..c7f6beb9c 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json from .common import InfoExtractor @@ -23,6 +22,7 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, @@ -33,9 +33,7 @@ class MuenchenTVIE(InfoExtractor): display_id = 'live' webpage = self._download_webpage(url, display_id) - now = datetime.datetime.now() - now_str = now.strftime("%Y-%m-%d %H:%M") - title = self._og_search_title(webpage) + ' ' + now_str + title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( r'(?s)\nplaylist:\s*(\[.*?}\]),related:', @@ -73,5 +71,6 @@ class MuenchenTVIE(InfoExtractor): 'title': title, 'formats': formats, 'is_live': True, + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2e4acbad..e75ab7c39 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -16,9 +16,9 @@ class NBCIE(InfoExtractor): _TEST = { 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188', - 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e', + # md5 checksum is not stable 'info_dict': { - 'id': 'u1RInQZRN7QJ', + 'id': 'bTmnLCvIbaaH', 'ext': 'flv', 'title': 'I Am a Firefighter', 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.', diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py new file mode 100644 index 000000000..963c4587c --- /dev/null +++ b/youtube_dl/extractor/nfl.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + remove_end, +) + + +class NFLIE(InfoExtractor): + IE_NAME = 'nfl.com' + _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' + _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' + _TEST = { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + config = self._download_json(self._PLAYER_CONFIG_URL, video_id, + note='Downloading player config') + url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) + video_data = self._download_json(url_template.format(id=video_id), video_id) + + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + formats = [] + streams = video_data.get('cdnData', {}).get('bitrateInfo', []) + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': + continue + + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + path_prefix = cdn.get('pathprefix', '') + if path_prefix and not path_prefix.endswith('/'): + path_prefix = '%s/' % path_prefix + + get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=p, + ) + + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = -1 + else: + preference = 0 + + for stream in streams: + path = stream.get('path') + if not path: + continue + + formats.append({ + 'url': get_url(path), + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': name, + }) + + self._sort_formats(formats) + + thumbnail = None + for q in ('xl', 'l', 'm', 's', 'xs'): + thumbnail = video_data.get('imagePaths', {}).get(q) + if thumbnail: + break + + return { + 'id': video_id, + 'title': video_data.get('storyHeadline'), + 'formats': formats, + 'description': video_data.get('caption'), + 'duration': video_data.get('duration'), + 'thumbnail': thumbnail, + 'timestamp': int_or_none(video_data.get('posted'), 1000), + } diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 959fdf590..7f1bc6377 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import re +import time +import hashlib from .common import InfoExtractor from ..utils import ( @@ -17,6 +19,7 @@ from ..utils import ( class NocoIE(InfoExtractor): _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' + _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _NETRC_MACHINE = 'noco' _TEST = { @@ -55,33 +58,52 @@ class NocoIE(InfoExtractor): login = self._download_json(request, None, 'Logging in as %s' % username) if 'erreur' in login: - raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + + def _call_api(self, path, video_id, note): + ts = compat_str(int(time.time() * 1000)) + tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() + url = self._API_URL_TEMPLATE % (path, ts, tk) + + resp = self._download_json(url, video_id, note) + + if isinstance(resp, dict) and resp.get('error'): + self._raise_error(resp['error'], resp['description']) + + return resp + + def _raise_error(self, error, description): + raise ExtractorError( + '%s returned error: %s - %s' % (self.IE_NAME, error, description), + expected=True) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - medias = self._download_json( - 'https://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON') + medias = self._call_api( + 'shows/%s/medias' % video_id, + video_id, 'Downloading video JSON') + + qualities = self._call_api( + 'qualities', + video_id, 'Downloading qualities JSON') formats = [] - for fmt in medias['fr']['video_list']['default']['quality_list']: - format_id = fmt['quality_key'] + for format_id, fmt in medias['fr']['video_list']['none']['quality_list'].items(): - file = self._download_json( - 'https://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id), + video = self._call_api( + 'shows/%s/video/%s/fr' % (video_id, format_id.lower()), video_id, 'Downloading %s video JSON' % format_id) - file_url = file['file'] + file_url = video['file'] if not file_url: continue - if file_url == 'forbidden': - raise ExtractorError( - '%s returned error: %s - %s' % ( - self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']), - expected=True) + if file_url in ['forbidden', 'not found']: + popmessage = video['popmessage'] + self._raise_error(popmessage['title'], popmessage['message']) formats.append({ 'url': file_url, @@ -91,20 +113,31 @@ class NocoIE(InfoExtractor): 'abr': fmt['audiobitrate'], 'vbr': fmt['videobitrate'], 'filesize': fmt['filesize'], - 'format_note': fmt['quality_name'], - 'preference': fmt['priority'], + 'format_note': qualities[format_id]['quality_name'], + 'preference': qualities[format_id]['priority'], }) self._sort_formats(formats) - show = self._download_json( - 'https://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0] + show = self._call_api( + 'shows/by_id/%s' % video_id, + video_id, 'Downloading show JSON')[0] - upload_date = unified_strdate(show['indexed']) + upload_date = unified_strdate(show['online_date_start_utc']) uploader = show['partner_name'] uploader_id = show['partner_key'] duration = show['duration_ms'] / 1000.0 - thumbnail = show['screenshot'] + + thumbnails = [] + for thumbnail_key, thumbnail_url in show.items(): + m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key) + if not m: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) episode = show.get('show_TT') or show.get('show_OT') family = show.get('family_TT') or show.get('family_OT') @@ -124,7 +157,7 @@ class NocoIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'upload_date': upload_date, 'uploader': uploader, 'uploader_id': uploader_id, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 7a154e94a..f36d446d2 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -7,6 +7,7 @@ from ..utils import ( unified_strdate, parse_duration, qualities, + url_basename, ) @@ -55,7 +56,9 @@ class NPOIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + return self._get_info(video_id) + def _get_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/aflevering/%s' % video_id, video_id, @@ -106,3 +109,30 @@ class NPOIE(InfoExtractor): 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, } + + +class TegenlichtVproIE(NPOIE): + IE_NAME = 'tegenlicht.vpro.nl' + _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' + + _TESTS = [ + { + 'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht', + 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'upload_date': '20130225', + }, + }, + ] + + def _real_extract(self, url): + name = url_basename(url) + webpage = self._download_webpage(url, name) + urn = self._html_search_meta('mediaurn', webpage) + info_page = self._download_json( + 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) + return self._get_info(info_page['mid']) diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py new file mode 100644 index 000000000..4a41c0542 --- /dev/null +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OktoberfestTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' + + _TEST = { + 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', + 'info_dict': { + 'id': 'hb-zelt', + 'ext': 'mp4', + 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.jpg$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._live_title(self._html_search_regex( + r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + + clip = self._search_regex( + r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') + ncurl = self._search_regex( + r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') + video_url = ncurl + clip + thumbnail = self._search_regex( + r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, + 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'is_live': True, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index 72df4d842..ebc046804 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -10,6 +10,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + str_to_int, ) @@ -29,6 +30,7 @@ class PlayFMIE(InfoExtractor): 'duration': 5627.428, 'upload_date': '20140712', 'view_count': int, + 'comment_count': int, 'thumbnail': 're:^https?://.*\.jpg$', }, } @@ -51,7 +53,8 @@ class PlayFMIE(InfoExtractor): recording = rec_doc.find('./recording') title = recording.find('./title').text - view_count = int_or_none(recording.find('./stats/playcount').text) + view_count = str_to_int(recording.find('./stats/playcount').text) + comment_count = str_to_int(recording.find('./stats/comments').text) duration = float_or_none(recording.find('./duration').text, scale=1000) thumbnail = recording.find('./image').text @@ -75,6 +78,7 @@ class PlayFMIE(InfoExtractor): 'title': title, 'upload_date': upload_date, 'view_count': view_count, + 'comment_count': comment_count, 'duration': duration, 'thumbnail': thumbnail, 'uploader': uploader, diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 34058fd4b..409f8540a 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -12,7 +12,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -21,12 +21,16 @@ class SBSIE(InfoExtractor): 'md5': '3150cf278965eeabb5b4cea1c963fe0a', 'info_dict': { 'id': '320403011771', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Dingo Conservation', 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction', 'thumbnail': 're:http://.*\.jpg', }, 'add_ies': ['generic'], + }, + { + 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py new file mode 100644 index 000000000..3f680bfc6 --- /dev/null +++ b/youtube_dl/extractor/sport5.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') + + webpage = self._download_webpage(url, media_id) + + video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') + + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) + + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) + + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) + + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'duration': duration, + 'categories': categories, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b6b2dba9c..0be793b1c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -5,6 +5,7 @@ import json from .common import InfoExtractor from ..utils import ( + compat_str, ExtractorError, xpath_with_ns, ) @@ -55,36 +56,44 @@ class ThePlatformIE(InfoExtractor): body = meta.find(_x('smil:body')) f4m_node = body.find(_x('smil:seq//smil:video')) - if f4m_node is not None: + if f4m_node is not None and '.f4m' in f4m_node.attrib['src']: f4m_url = f4m_node.attrib['src'] if 'manifest.f4m?' not in f4m_url: f4m_url += '?' # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' - formats = [{ - 'ext': 'flv', - 'url': f4m_url, - }] + formats = self._extract_f4m_formats(f4m_url, video_id) else: - base_url = head.find(_x('smil:meta')).attrib['base'] - switch = body.find(_x('smil:switch')) formats = [] - for f in switch.findall(_x('smil:video')): - attr = f.attrib - width = int(attr['width']) - height = int(attr['height']) - vbr = int(attr['system-bitrate']) // 1000 - format_id = '%dx%d_%dk' % (width, height, vbr) - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'play_path': 'mp4:' + attr['src'], - 'ext': 'flv', - 'width': width, - 'height': height, - 'vbr': vbr, - }) + switch = body.find(_x('smil:switch')) + if switch is not None: + base_url = head.find(_x('smil:meta')).attrib['base'] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': width, + 'height': height, + 'vbr': vbr, + }) + else: + switch = body.find(_x('smil:seq//smil:switch')) + for f in switch.findall(_x('smil:video')): + attr = f.attrib + vbr = int(attr['system-bitrate']) // 1000 + formats.append({ + 'format_id': compat_str(vbr), + 'url': attr['src'], + 'vbr': vbr, + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py new file mode 100644 index 000000000..607e947bb --- /dev/null +++ b/youtube_dl/extractor/thvideo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate +) + + +class THVideoIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/v/th1987/', + 'md5': 'fa107b1f73817e325e9433505a70db50', + 'info_dict': { + 'id': '1987', + 'ext': 'mp4', + 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', + 'display_id': 'th1987', + 'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', + 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', + 'upload_date': '20140722' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # extract download link from mobile player page + webpage_player = self._download_webpage( + 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), + video_id, note='Downloading video source page') + video_url = self._html_search_regex( + r'<source src="(.*?)" type', webpage_player, 'video url') + + # extract video info from main page + webpage = self._download_webpage( + 'http://thvideo.tv/v/th%s' % (video_id), video_id) + title = self._og_search_title(webpage) + display_id = 'th%s' % video_id + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_regex( + r'span itemprop="datePublished" content="(.*?)">', webpage, + 'upload date', fatal=False)) + + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'description': description, + 'upload_date': upload_date + } diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 08a48c05a..64a1e9030 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -14,27 +14,35 @@ from ..aes import aes_decrypt_text class Tube8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P<id>\d+)' - _TEST = { - 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', - 'md5': '44bf12b98313827dd52d35b8706a4ea0', - 'info_dict': { - 'id': '229795', - 'ext': 'mp4', - 'description': 'hot teen Kasia grinding', - 'uploader': 'unknown', - 'title': 'Kasia music video', - 'age_limit': 18, - } - } + _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', + 'md5': '44bf12b98313827dd52d35b8706a4ea0', + 'info_dict': { + 'id': '229795', + 'display_id': 'kasia-music-video', + 'ext': 'mp4', + 'description': 'hot teen Kasia grinding', + 'uploader': 'unknown', + 'title': 'Kasia music video', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + webpage = self._download_webpage(req, display_id) flashvars = json.loads(self._html_search_regex( r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars')) @@ -70,6 +78,7 @@ class Tube8IE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'url': video_url, 'title': title, 'description': description, diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index df115d251..ebd64f0f5 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -19,7 +19,7 @@ class Vbox7IE(InfoExtractor): 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { 'id': '249bb972c2', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Смях! Чудо - чист за секунди - Скрита камера', }, } @@ -50,7 +50,6 @@ class Vbox7IE(InfoExtractor): return { 'id': video_id, 'url': final_url, - 'ext': 'flv', 'title': title, 'thumbnail': thumbnail_url, } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index d2ffd1b6b..5b1a3ec78 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_HTTPError, + compat_urllib_request, ExtractorError, ) @@ -24,7 +24,7 @@ class VevoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - "md5": "06bea460acb744eab74a9d7dcb4bfd61", + "md5": "95ee28ee45e70130e3ab02b0f579ae23", 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', @@ -40,7 +40,7 @@ class VevoIE(InfoExtractor): }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', - 'md5': '893ec0e0d4426a1d96c01de8f2bdff58', + 'md5': 'f6ab09b034f8c22969020b042e5ac7fc', 'info_dict': { 'id': 'USUV71302923', 'ext': 'mp4', @@ -69,6 +69,21 @@ class VevoIE(InfoExtractor): }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' + def _real_initialize(self): + req = compat_urllib_request.Request( + 'http://www.vevo.com/auth', data=b'') + webpage = self._download_webpage( + req, None, + note='Retrieving oauth token', + errnote='Unable to retrieve oauth token', + fatal=False) + if webpage is False: + self._oauth_token = None + else: + self._oauth_token = self._search_regex( + r'access_token":\s*"([^"]+)"', + webpage, 'access token', fatal=False) + def _formats_from_json(self, video_info): last_version = {'version': -1} for version in video_info['videoVersions']: @@ -129,6 +144,26 @@ class VevoIE(InfoExtractor): }) return formats + def _download_api_formats(self, video_id): + if not self._oauth_token: + self._downloader.report_warning( + 'No oauth token available, skipping API HLS download') + return [] + + api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( + video_id, self._oauth_token) + api_data = self._download_json( + api_url, video_id, + note='Downloading HLS formats', + errnote='Failed to download HLS format list', fatal=False) + if api_data is None: + return [] + + m3u8_url = api_data[0]['url'] + return self._extract_m3u8_formats( + m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4', + preference=0) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -152,30 +187,8 @@ class VevoIE(InfoExtractor): else: age_limit = None - # Download SMIL - smil_blocks = sorted(( - f for f in video_info['videoVersions'] - if f['sourceType'] == 13), - key=lambda f: f['version']) - - smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( - self._SMIL_BASE_URL, video_id, video_id.lower()) - if smil_blocks: - smil_url_m = self._search_regex( - r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL', - fatal=False) - if smil_url_m is not None: - smil_url = smil_url_m - - try: - smil_xml = self._download_webpage(smil_url, video_id, - 'Downloading SMIL info') - formats.extend(self._formats_from_smil(smil_xml)) - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError): - raise - self._downloader.report_warning( - 'Cannot download SMIL information, falling back to JSON ..') + # Download via HLS API + formats.extend(self._download_api_formats(video_id)) self._sort_formats(formats) timestamp_ms = int(self._search_regex( diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 2544c24bd..1b2f731e9 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, compat_str, + ExtractorError, ) @@ -16,6 +17,24 @@ class VubeIE(InfoExtractor): _TESTS = [ { + 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', + 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', + 'info_dict': { + 'id': 'Y8NUZ69Tf7', + 'ext': 'mp4', + 'title': 'Best Drummer Ever [HD]', + 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'William', + 'timestamp': 1406876915, + 'upload_date': '20140801', + 'duration': 258.051, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], + }, + }, { 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', 'md5': 'db7aba89d4603dadd627e9d1973946fe', 'info_dict': { @@ -32,7 +51,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], - } + }, + 'skip': 'Removed due to DMCA', }, { 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', @@ -51,7 +71,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['seraina', 'jessica', 'krewella', 'alive'], - } + }, + 'skip': 'Removed due to DMCA', }, { 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', 'md5': '0584fc13b50f887127d9d1007589d27f', @@ -69,7 +90,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], - } + }, + 'skip': 'Removed due to DMCA', } ] @@ -102,6 +124,11 @@ class VubeIE(InfoExtractor): self._sort_formats(formats) + if not formats and video.get('vst') == 'dmca': + raise ExtractorError( + 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', + expected=True) + title = video['title'] description = video.get('description') thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 46b4d9133..bf9e40bad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -5,7 +5,10 @@ import re import hashlib from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + ExtractorError, + unified_strdate, +) class WatIE(InfoExtractor): @@ -37,6 +40,7 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', 'duration': 2910, }, + 'skip': "Ce contenu n'est pas disponible pour l'instant.", }, ] @@ -57,6 +61,11 @@ class WatIE(InfoExtractor): video_info = self.download_video_info(real_id) + error_desc = video_info.get('error_desc') + if error_desc: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + geo_list = video_info.get('geoList') country = geo_list[0] if geo_list else '' diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index e6bfa9e14..748443f81 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,13 +1,14 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..utils import ExtractorError, compat_urllib_request class WistiaIE(InfoExtractor): _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' + _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json' _TEST = { 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', @@ -24,11 +25,13 @@ class WistiaIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - data_json = self._html_search_regex( - r'Wistia\.iframeInit\((.*?), {}\);', webpage, 'video data') - - data = json.loads(data_json) + request = compat_urllib_request.Request(self._API_URL.format(video_id)) + request.add_header('Referer', url) # Some videos require this. + data_json = self._download_json(request, video_id) + if data_json.get('error'): + raise ExtractorError('Error while getting the playlist', + expected=True) + data = data_json['media'] formats = [] thumbnails = [] diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py new file mode 100644 index 000000000..24872861a --- /dev/null +++ b/youtube_dl/extractor/ynet.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import compat_urllib_parse + + +class YnetIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _TESTS = [ + { + 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', + 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'info_dict': { + 'id': 'L-11659-99244', + 'ext': 'flv', + 'title': 'איש לא יודע מאיפה באנו', + 'thumbnail': 're:^https?://.*\.jpg', + } + }, { + 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', + 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'info_dict': { + 'id': 'L-8859-84418', + 'ext': 'flv', + 'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין", + 'thumbnail': 're:^https?://.*\.jpg', + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) + config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config')) + f4m_url = config['clip']['url'] + title = self._og_search_title(webpage) + m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title) + if m: + title = m.group('title') + + return { + 'id': video_id, + 'title': title, + 'formats': self._extract_f4m_formats(f4m_url, video_id), + 'thumbnail': self._og_search_thumbnail(webpage), + }
\ No newline at end of file diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index a8fd40c83..48d47a245 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,6 +1,7 @@ # coding: utf-8 -import json +from __future__ import unicode_literals + import math import random import re @@ -13,18 +14,25 @@ from ..utils import ( class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)' - _TEST = { - u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", - u"file": u"XNDgyMDQ2NTQw_part00.flv", - u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b", - u"params": {u"test": False}, - u"info_dict": { - u"title": u"youtube-dl test video \"'/\\ä↭𝕐" + _VALID_URL = r'''(?x) + (?: + http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + youku:) + (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) + ''' + _TEST = { + 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html', + 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b', + 'params': { + 'test': False + }, + 'info_dict': { + 'id': 'XNDgyMDQ2NTQw_part00', + 'ext': 'flv', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐' } } - def _gen_sid(self): nowTime = int(time.time() * 1000) random1 = random.randint(1000,1998) @@ -55,49 +63,42 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('ID') + video_id = mobj.group('id') info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id - jsondata = self._download_webpage(info_url, video_id) - - self.report_extraction(video_id) - try: - config = json.loads(jsondata) - error_code = config['data'][0].get('error_code') - if error_code: - # -8 means blocked outside China. - error = config['data'][0].get('error') # Chinese and English, separated by newline. - raise ExtractorError(error or u'Server reported error %i' % error_code, - expected=True) - - video_title = config['data'][0]['title'] - seed = config['data'][0]['seed'] - - format = self._downloader.params.get('format', None) - supported_format = list(config['data'][0]['streamfileids'].keys()) - - if format is None or format == 'best': - if 'hd2' in supported_format: - format = 'hd2' - else: - format = 'flv' - ext = u'flv' - elif format == 'worst': - format = 'mp4' - ext = u'mp4' - else: - format = 'flv' - ext = u'flv' + config = self._download_json(info_url, video_id) + + error_code = config['data'][0].get('error_code') + if error_code: + # -8 means blocked outside China. + error = config['data'][0].get('error') # Chinese and English, separated by newline. + raise ExtractorError(error or 'Server reported error %i' % error_code, + expected=True) + video_title = config['data'][0]['title'] + seed = config['data'][0]['seed'] - fileid = config['data'][0]['streamfileids'][format] - keys = [s['k'] for s in config['data'][0]['segs'][format]] - # segs is usually a dictionary, but an empty *list* if an error occured. - except (UnicodeDecodeError, ValueError, KeyError): - raise ExtractorError(u'Unable to extract info section') + format = self._downloader.params.get('format', None) + supported_format = list(config['data'][0]['streamfileids'].keys()) + + # TODO proper format selection + if format is None or format == 'best': + if 'hd2' in supported_format: + format = 'hd2' + else: + format = 'flv' + ext = 'flv' + elif format == 'worst': + format = 'mp4' + ext = 'mp4' + else: + format = 'flv' + ext = 'flv' + + fileid = config['data'][0]['streamfileids'][format] + keys = [s['k'] for s in config['data'][0]['segs'][format]] + # segs is usually a dictionary, but an empty *list* if an error occured. files_info=[] sid = self._gen_sid() @@ -106,9 +107,8 @@ class YoukuIE(InfoExtractor): #column 8,9 of fileid represent the segment number #fileid[7:9] should be changed for index, key in enumerate(keys): - temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) - download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) + download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) info = { 'id': '%s_part%02d' % (video_id, index), diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py new file mode 100644 index 000000000..40fc4165f --- /dev/null +++ b/youtube_dl/extractor/yourupload.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class YourUploadIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:yourupload\.com/watch| + embed\.yourupload\.com| + embed\.yucache\.net + )/(?P<id>[A-Za-z0-9]+) + ''' + _TESTS = [ + { + 'url': 'http://yourupload.com/watch/14i14h', + 'md5': 'bf5c2f95c4c917536e80936af7bc51e1', + 'info_dict': { + 'id': '14i14h', + 'ext': 'mp4', + 'title': 'BigBuckBunny_320x180.mp4', + 'thumbnail': 're:^https?://.*\.jpe?g', + } + }, + { + 'url': 'http://embed.yourupload.com/14i14h', + 'only_matching': True, + }, + { + 'url': 'http://embed.yucache.net/14i14h?client_file_id=803349', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://embed.yucache.net/{0:}'.format(video_id) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + url = self._og_search_video_url(webpage) + + formats = [{ + 'format_id': 'sd', + 'url': url, + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b54c69122..99198e380 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,7 +46,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _set_language(self): return bool(self._download_webpage( self._LANG_URL, None, - note=u'Setting language', errnote='unable to set language', + note='Setting language', errnote='unable to set language', fatal=False)) def _login(self): @@ -61,13 +61,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # No authentication to be performed if username is None: if self._LOGIN_REQUIRED: - raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True login_page = self._download_webpage( self._LOGIN_URL, None, - note=u'Downloading login page', - errnote=u'unable to fetch login page', fatal=False) + note='Downloading login page', + errnote='unable to fetch login page', fatal=False) if login_page is False: return @@ -105,12 +105,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): req = compat_urllib_request.Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( req, None, - note=u'Logging in', errnote=u'unable to log in', fatal=False) + note='Logging in', errnote='unable to log in', fatal=False) if login_results is False: return False if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) # Two-Factor # TODO add SMS and phone call support - these require making a request and then prompting the user @@ -119,19 +119,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor): tfa_code = self._get_tfa_info() if tfa_code is None: - self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>') - self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)') + self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>') + self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False # Unlike the first login form, secTok and timeStmp are both required for the TFA form match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) if match is None: - self._downloader.report_warning(u'Failed to get secTok - did the page structure change?') + self._downloader.report_warning('Failed to get secTok - did the page structure change?') secTok = match.group(1) match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) if match is None: - self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?') + self._downloader.report_warning('Failed to get timeStmp - did the page structure change?') timeStmp = match.group(1) tfa_form_strs = { @@ -155,23 +155,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( tfa_req, None, - note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False) + note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) if tfa_results is False: return False if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None: - self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.') + self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning(u'unable to log in - did the page structure change?') + self._downloader.report_warning('unable to log in - did the page structure change?') return False if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') + self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') + self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -185,7 +185,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._download_webpage( req, None, - note=u'Confirming age', errnote=u'Unable to confirm age') + note='Confirming age', errnote='Unable to confirm age') return True def _real_initialize(self): @@ -211,7 +211,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ + (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -307,69 +307,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc", - u"file": u"BaW_jenozKc.mp4", - u"info_dict": { - u"title": u"youtube-dl test video \"'/\\ä↭𝕐", - u"uploader": u"Philipp Hagemeister", - u"uploader_id": u"phihag", - u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .", - u"categories": [u'Science & Technology'], + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], 'like_count': int, 'dislike_count': int, } }, { - u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY", - u"file": u"UxxajLWwzqY.mp4", - u"note": u"Test generic use_cipher_signature video (#897)", - u"info_dict": { - u"upload_date": u"20120506", - u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f", - u"uploader": u"Icona Pop", - u"uploader_id": u"IconaPop" + 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', + 'note': 'Test generic use_cipher_signature video (#897)', + 'info_dict': { + 'id': 'UxxajLWwzqY', + 'ext': 'mp4', + 'upload_date': '20120506', + 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', + 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'uploader': 'Icona Pop', + 'uploader_id': 'IconaPop', } }, { - u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ", - u"file": u"07FYdnEawAQ.mp4", - u"note": u"Test VEVO video with age protection (#956)", - u"info_dict": { - u"upload_date": u"20130703", - u"title": u"Justin Timberlake - Tunnel Vision (Explicit)", - u"description": u"md5:64249768eec3bc4276236606ea996373", - u"uploader": u"justintimberlakeVEVO", - u"uploader_id": u"justintimberlakeVEVO" + 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', + 'note': 'Test VEVO video with age protection (#956)', + 'info_dict': { + 'id': '07FYdnEawAQ', + 'ext': 'mp4', + 'upload_date': '20130703', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'description': 'md5:64249768eec3bc4276236606ea996373', + 'uploader': 'justintimberlakeVEVO', + 'uploader_id': 'justintimberlakeVEVO', } }, { - u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ", - u"file": u"yZIXLfi8CZQ.mp4", - u"note": u"Embed-only video (#1746)", - u"info_dict": { - u"upload_date": u"20120608", - u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012", - u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7", - u"uploader": u"SET India", - u"uploader_id": u"setindia" + 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', + 'note': 'Embed-only video (#1746)', + 'info_dict': { + 'id': 'yZIXLfi8CZQ', + 'ext': 'mp4', + 'upload_date': '20120608', + 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', + 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', + 'uploader': 'SET India', + 'uploader_id': 'setindia' } }, { - u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", - u"file": u"a9LDPn-MO4I.m4a", - u"note": u"256k DASH audio (format 141) via DASH manifest", - u"info_dict": { - u"upload_date": "20121002", - u"uploader_id": "8KVIDEO", - u"description": '', - u"uploader": "8KVIDEO", - u"title": "UHDTV TEST 8K VIDEO.mp4" + 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', + 'note': '256k DASH audio (format 141) via DASH manifest', + 'info_dict': { + 'id': 'a9LDPn-MO4I', + 'ext': 'm4a', + 'upload_date': '20121002', + 'uploader_id': '8KVIDEO', + 'description': '', + 'uploader': '8KVIDEO', + 'title': 'UHDTV TEST 8K VIDEO.mp4' }, - u"params": { - u"youtube_include_dash_manifest": True, - u"format": "141", + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', }, }, # DASH manifest with encrypted signature @@ -384,7 +389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', }, - u"params": { + 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, @@ -397,19 +402,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) + self.to_screen('%s: Downloading video info webpage' % video_id) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) + self.to_screen('%s: Extracting video information' % video_id) def report_unavailable_format(self, video_id, format): """Report extracted video URL.""" - self.to_screen(u'%s: Format %s not available' % (video_id, format)) + self.to_screen('%s: Format %s not available' % (video_id, format)) def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" - self.to_screen(u'RTMP download detected') + self.to_screen('RTMP download detected') def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ @@ -429,21 +434,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_type, player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id - cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id) + cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) if player_type == 'js': code = self._download_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, player_id), - errnote=u'Download of %s failed' % player_url) + note='Downloading %s player %s' % (player_type, player_id), + errnote='Download of %s failed' % player_url) res = self._parse_sig_js(code) elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, player_id), - errnote=u'Download of %s failed' % player_url) + note='Downloading %s player %s' % (player_type, player_id), + errnote='Download of %s failed' % player_url) code = urlh.read() res = self._parse_sig_swf(code) else: @@ -454,15 +459,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] - self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec) + self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) - ends = (u':%d' % (end+step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (u':%d' % step) + ends = (':%d' % (end+step)) if end + step >= 0 else ':' + steps = '' if step == 1 else (':%d' % step) return 's[%s%s%s]' % (starts, ends, steps) step = None @@ -492,9 +497,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n' + code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen(u'Extracted signature function:\n' + code) + self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( @@ -516,9 +521,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Turn the encrypted s field into a working signature""" if player_url is None: - raise ExtractorError(u'Cannot decrypt signature without player_url') + raise ExtractorError('Cannot decrypt signature without player_url') - if player_url.startswith(u'//'): + if player_url.startswith('//'): player_url = 'https:' + player_url try: player_id = (player_url, self._signature_cache_id(s)) @@ -542,7 +547,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) return {} lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) @@ -560,7 +565,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url = 'https://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url if not sub_lang_list: - self._downloader.report_warning(u'video doesn\'t have subtitles') + self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list @@ -568,7 +573,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" sub_format = self._downloader.params.get('subtitlesformat', 'srt') - self.to_screen(u'%s: Looking for automatic captions' % video_id) + self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id if mobj is None: @@ -589,7 +594,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : - self._downloader.report_warning(u'Video doesn\'t have automatic captions') + self._downloader.report_warning('Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] @@ -615,7 +620,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group(2) return video_id @@ -635,7 +640,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _extract_annotations(self, video_id): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id - return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') + return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') def _real_extract(self, url): proto = ( @@ -705,14 +710,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError(u'"rental" videos not supported') + raise ExtractorError('"rental" videos not supported') # Start extracting information self.report_information_extraction(video_id) # uploader if 'author' not in video_info: - raise ExtractorError(u'Unable to extract uploader name') + raise ExtractorError('Unable to extract uploader name') video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) # uploader_id @@ -721,13 +726,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if mobj is not None: video_uploader_id = mobj.group(1) else: - self._downloader.report_warning(u'unable to extract uploader nickname') + self._downloader.report_warning('unable to extract uploader nickname') # title if 'title' in video_info: video_title = video_info['title'][0] else: - self._downloader.report_warning(u'Unable to extract video title') + self._downloader.report_warning('Unable to extract video title') video_title = '_' # thumbnail image @@ -737,7 +742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if m_thumb is not None: video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: - self._downloader.report_warning(u'unable to extract video thumbnail') + self._downloader.report_warning('unable to extract video thumbnail') video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -791,8 +796,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if count is not None: return int(count.replace(',', '')) return None - like_count = _extract_count(u'like') - dislike_count = _extract_count(u'dislike') + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -802,7 +807,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return if 'length_seconds' not in video_info: - self._downloader.report_warning(u'unable to extract video duration') + self._downloader.report_warning('unable to extract video duration') video_duration = None else: video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) @@ -823,11 +828,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: - raise ValueError(u'No stream_map present') # caught below + raise ValueError('No stream_map present') # caught below re_signature = re.compile(r'[&,]s=') m_s = re_signature.search(args['url_encoded_fmt_stream_map']) if m_s is not None: - self.to_screen(u'%s: Encrypted signatures detected.' % video_id) + self.to_screen('%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] m_s = re_signature.search(args.get('adaptive_fmts', '')) if m_s is not None: @@ -905,7 +910,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen(u'{%s} signature length %s, %s' % + self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) signature = self._decrypt_signature( @@ -920,7 +925,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) else: - raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest if (self._downloader.params.get('youtube_include_dash_manifest', False)): @@ -941,9 +946,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( dash_manifest_url, video_id, - note=u'Downloading DASH manifest', - errnote=u'Could not download DASH manifest') - for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + note='Downloading DASH manifest', + errnote='Could not download DASH manifest') + for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') if url_el is None: continue @@ -969,7 +974,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): existing_format.update(f) except (ExtractorError, KeyError) as e: - self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) + self.report_warning('Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) @@ -1000,7 +1005,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): (?:\w+\.)? youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch) + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) \? (?:.*?&)*? (?:p|a|list)= | p/ ) @@ -1056,6 +1061,20 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'title': 'YDL_safe_search', }, 'playlist_count': 2, + }, { + 'note': 'embedded', + 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + } + }, { + 'note': 'Embedded SWF player', + 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA7', + } }] def _real_initialize(self): @@ -1090,7 +1109,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # Extract playlist id mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) # Check if it's a video-specific URL @@ -1098,16 +1117,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if 'v' in query_dict: video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): - self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, 'Youtube', video_id=video_id) else: - self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) if playlist_id.startswith('TL'): - raise ExtractorError(u'For downloading YouTube.com top lists, use ' + raise ExtractorError('For downloading YouTube.com top lists, use ' 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) url = self._TEMPLATE_URL % playlist_id @@ -1152,19 +1171,28 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): class YoutubeTopListIE(YoutubePlaylistIE): IE_NAME = 'youtube:toplist' - IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"' + IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' ' (Example: "yttoplist:music:Top Tracks")') _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' - _TESTS = [] + _TESTS = [{ + 'url': 'yttoplist:music:Trending', + 'playlist_mincount': 5, + 'skip': 'Only works for logged-in users', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) channel = mobj.group('chann') title = mobj.group('title') query = compat_urllib_parse.urlencode({'title': title}) - playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query) - channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex(playlist_re, channel_page, 'list') + channel_page = self._download_webpage( + 'https://www.youtube.com/%s' % channel, title) + link = self._html_search_regex( + r'''(?x) + <a\s+href="([^"]+)".*?>\s* + <span\s+class="branded-page-module-title-text">\s* + <span[^>]*>.*?%s.*?</span>''' % re.escape(query), + channel_page, 'list') url = compat_urlparse.urljoin('https://www.youtube.com/', link) video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' @@ -1190,6 +1218,11 @@ class YoutubeChannelIE(InfoExtractor): _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = 'youtube:channel' + _TESTS = [{ + 'note': 'paginated channel', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'playlist_mincount': 91, + }] def extract_videos_from_page(self, page): ids_in_page = [] @@ -1202,7 +1235,7 @@ class YoutubeChannelIE(InfoExtractor): # Extract channel id mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) # Download channel page channel_id = mobj.group(1) @@ -1224,7 +1257,7 @@ class YoutubeChannelIE(InfoExtractor): for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_json( - url, channel_id, note=u'Downloading page #%s' % pagenum, + url, channel_id, note='Downloading page #%s' % pagenum, transform_source=uppercase_escape) ids_in_page = self.extract_videos_from_page(page['content_html']) @@ -1233,7 +1266,7 @@ class YoutubeChannelIE(InfoExtractor): if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: break - self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) + self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) for video_id in video_ids] @@ -1248,6 +1281,17 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = 'youtube:user' + _TESTS = [{ + 'url': 'https://www.youtube.com/user/TheLinuxFoundation', + 'playlist_mincount': 320, + 'info_dict': { + 'title': 'TheLinuxFoundation', + } + }, { + 'url': 'ytuser:phihag', + 'only_matching': True, + }] + @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with other youtube @@ -1260,7 +1304,7 @@ class YoutubeUserIE(InfoExtractor): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) username = mobj.group(1) @@ -1281,7 +1325,7 @@ class YoutubeUserIE(InfoExtractor): try: response = json.loads(page) except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + raise ExtractorError('Invalid JSON in API response: ' + compat_str(err)) if 'entry' not in response['feed']: return @@ -1322,9 +1366,9 @@ class YoutubeSearchIE(SearchInfoExtractor): compat_urllib_parse.quote_plus(query.encode('utf-8')), (PAGE_SIZE * pagenum) + 1) data_json = self._download_webpage( - result_url, video_id=u'query "%s"' % query, - note=u'Downloading page %s' % (pagenum + 1), - errnote=u'Unable to download API page') + result_url, video_id='query "%s"' % query, + note='Downloading page %s' % (pagenum + 1), + errnote='Unable to download API page') data = json.loads(data_json) api_response = data['data'] @@ -1356,6 +1400,13 @@ class YoutubeSearchURLIE(InfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -1390,17 +1441,38 @@ class YoutubeSearchURLIE(InfoExtractor): class YoutubeShowIE(InfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' + _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' + _TESTS = [{ + 'url': 'http://www.youtube.com/show/airdisasters', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'airdisasters', + 'title': 'Air Disasters', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - show_name = mobj.group(1) - webpage = self._download_webpage(url, show_name, 'Downloading show webpage') + playlist_id = mobj.group('id') + webpage = self._download_webpage( + url, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) - self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons))) - return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] + self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) + entries = [ + self.url_result( + 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist') + for season in m_seasons + ] + title = self._og_search_title(webpage, fatal=False) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'entries': entries, + } class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): |