diff options
Diffstat (limited to 'youtube_dl')
54 files changed, 1628 insertions, 333 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b07c0b4cc..76726305a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -134,13 +134,16 @@ from .gamestar import GameStarIE from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .globo import GloboIE from .godtube import GodTubeIE +from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE from .hark import HarkIE +from .heise import HeiseIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hornbunny import HornBunnyIE @@ -188,6 +191,7 @@ from .livestream import ( LivestreamOriginalIE, LivestreamShortenerIE, ) +from .lrt import LRTIE from .lynda import ( LyndaIE, LyndaCourseIE @@ -261,6 +265,7 @@ from .nrk import ( from .ntv import NTVIE from .nytimes import NYTimesIE from .nuvid import NuvidIE +from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .orf import ( ORFTVthekIE, @@ -271,6 +276,8 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .planetaplay import PlanetaPlayIE +from .played import PlayedIE from .playfm import PlayFMIE from .playvid import PlayvidIE from .podomatic import PodomaticIE @@ -350,6 +357,7 @@ from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE +from .tapely import TapelyIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -363,11 +371,15 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE from .theplatform import ThePlatformIE +from .thesixtyone import TheSixtyOneIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE -from .thvideo import THVideoIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE @@ -408,11 +420,12 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vimeo import ( VimeoIE, - VimeoChannelIE, - VimeoUserIE, VimeoAlbumIE, + VimeoChannelIE, VimeoGroupsIE, + VimeoLikesIE, VimeoReviewIE, + VimeoUserIE, VimeoWatchLaterIE, ) from .vimple import VimpleIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 7d89f44ee..69f89320c 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -22,8 +22,7 @@ class ABCIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) urls_info_json = self._search_regex( diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 54cec1c2f..8de9c11ea 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,8 +8,6 @@ from ..utils import ( determine_ext, ExtractorError, qualities, - compat_urllib_parse_urlparse, - compat_urllib_parse, int_or_none, parse_duration, unified_strdate, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 957d35979..c3d02f85e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + upload_date_str = player_info.get('shootingDate') + if not upload_date_str: + upload_date_str = player_info.get('VDA', '').split(' ')[0] + info_dict = { 'id': player_info['VID'], 'title': player_info['VTI'], 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c569aa4d2..c13446665 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -15,13 +15,23 @@ class BandcampIE(InfoExtractor): _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', - 'file': '1812978515.mp3', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { - "title": "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", - "duration": 9.8485, + 'id': '1812978515', + 'ext': 'mp3', + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' + }, { + 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', + 'md5': '2b68e5851514c20efdff2afc5603b8b4', + 'info_dict': { + 'id': '2650410135', + 'ext': 'mp3', + 'title': 'Lanius (Battle)', + 'uploader': 'Ben Prunty Music', + }, }] def _real_extract(self, url): @@ -59,9 +69,9 @@ class BandcampIE(InfoExtractor): raise ExtractorError('No free songs found') download_link = m_download.group(1) - video_id = re.search( - r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', - webpage, re.MULTILINE | re.DOTALL).group('id') + video_id = self._search_regex( + r'var TralbumData = {.*?id: (?P<id>\d+),?$', + webpage, 'video id', flags=re.MULTILINE | re.DOTALL) download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') # We get the dictionary of the track from some javascript code diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 4e2960c62..2e277c8c3 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -26,6 +26,8 @@ class BRIE(InfoExtractor): 'title': 'Wenn das Traditions-Theater wackelt', 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', 'duration': 34, + 'uploader': 'BR', + 'upload_date': '20140802', } }, { @@ -66,8 +68,7 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') + display_id = self._match_id(url) page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 1bfc9f35b..2c0e5eea2 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,37 +4,61 @@ import re import json from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, +) class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)' - _TEST = { + _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', - 'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b', + 'md5': '33aa4ff477ecd124d18d7b5d23b87ce5', 'info_dict': { 'id': '2468056', 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', } - } + }, { + 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split("-")[-1] - embed_url = 'http://www.break.com/embed/%s' % video_id - webpage = self._download_webpage(embed_url, video_id) - info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', - webpage, 'info json', flags=re.DOTALL) - info = json.loads(info_json) - video_url = info['videoUri'] + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.break.com/embed/%s' % video_id, video_id) + info = json.loads(self._search_regex( + r'var embedVars = ({.*})\s*?</script>', + webpage, 'info json', flags=re.DOTALL)) + youtube_id = info.get('youtubeId') if youtube_id: return self.url_result(youtube_id, 'Youtube') - final_url = video_url + '?' + info['AuthToken'] + formats = [{ + 'url': media['uri'] + '?' + info['AuthToken'], + 'tbr': media['bitRate'], + 'width': media['width'], + 'height': media['height'], + } for media in info['media']] + + if not formats: + formats.append({ + 'url': info['videoUri'] + }) + + self._sort_formats(formats) + + duration = int_or_none(info.get('videoLengthInSeconds')) + age_limit = parse_age_limit(info.get('audienceRating')) + return { 'id': video_id, - 'url': final_url, 'title': info['contentName'], 'thumbnail': info['thumbUri'], + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 65c12136a..d4227e6eb 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - 'duration': 1317, } } @@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor): thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", webpage, 'thumbnail', fatal=False) - duration = int_or_none(self._search_regex( - r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False)) return { 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': duration, 'age_limit': self._rta_search(webpage), 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 60cab6f4e..450c7dfd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import base64 +import datetime import hashlib import json import netrc @@ -21,6 +22,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, @@ -136,6 +138,8 @@ class InfoExtractor(object): Unless mentioned otherwise, the fields should be Unicode strings. + Unless mentioned otherwise, None is equivalent to absence of information. + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@ -165,6 +169,14 @@ class InfoExtractor(object): return cls._VALID_URL_RE.match(url) is not None @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + + @classmethod def working(cls): """Getter method for _WORKING.""" return cls._WORKING @@ -324,7 +336,11 @@ class InfoExtractor(object): try: return json.loads(json_string) except ValueError as ve: - raise ExtractorError('Failed to download JSON', cause=ve) + errmsg = '%s: Failed to parse JSON ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def report_warning(self, msg, video_id=None): idstr = '' if video_id is None else '%s: ' % video_id @@ -705,6 +721,34 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + def _live_title(self, name): + """ Generate the title for a live video """ + now = datetime.datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M") + return name + ' ' + now_str + + def _int(self, v, name, fatal=False, **kwargs): + res = int_or_none(v, **kwargs) + if 'get_attr' in kwargs: + print(getattr(v, kwargs['get_attr'])) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + + def _float(self, v, name, fatal=False, **kwargs): + res = float_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 66a8f16d9..dbcf5d6a7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): ] def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - + video_id = self._match_id(url) url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information @@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = self._search_regex( - r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False) - if view_count is not None: - view_count = str_to_int(view_count) + view_count = str_to_int(self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', + webpage, 'view count', fatal=False)) + + title = self._og_search_title(webpage, default=None) + if title is None: + title = self._html_search_regex( + r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, + 'title') return { - 'id': video_id, + 'id': video_id, 'formats': formats, 'uploader': info['owner.screenname'], - 'upload_date': video_upload_date, - 'title': self._og_search_title(webpage), - 'subtitles': video_subtitles, + 'upload_date': video_upload_date, + 'title': title, + 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, 'view_count': view_count, diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 817a9bd61..5f24ac721 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor): video_id = mobj.group('id') fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] - video_url = ( - re.sub(r'[?&]dl=0', '', url) + - ('?' if '?' in url else '&') + 'dl=1') + video_url = re.sub(r'[?&]dl=0', '', url) + video_url += ('?' if '?' not in video_url else '&') + 'dl=1' return { 'id': video_id, diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 4ba323148..2cba82532 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,20 +9,20 @@ from ..utils import ExtractorError class EitbIE(InfoExtractor): - IE_NAME = u'eitb.tv' + IE_NAME = 'eitb.tv' _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' _TEST = { - u'add_ie': ['Brightcove'], - u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', - u'md5': u'edf4436247185adee3ea18ce64c47998', - u'info_dict': { - u'id': u'2743577154001', - u'ext': u'mp4', - u'title': u'60 minutos (Lasa y Zabala, 30 años)', + 'add_ie': ['Brightcove'], + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'md5': 'edf4436247185adee3ea18ce64c47998', + 'info_dict': { + 'id': '2743577154001', + 'ext': 'mp4', + 'title': '60 minutos (Lasa y Zabala, 30 años)', # All videos from eitb has this description in the brightcove info - u'description': u'.', - u'uploader': u'Euskal Telebista', + 'description': '.', + 'uploader': 'Euskal Telebista', }, } @@ -30,7 +32,7 @@ class EitbIE(InfoExtractor): webpage = self._download_webpage(url, chapter_id) bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is None: - raise ExtractorError(u'Could not extract the Brightcove url') + raise ExtractorError('Could not extract the Brightcove url') # The BrightcoveExperience object doesn't contain the video id, we set # it manually bc_url += '&%40videoPlayer={0}'.format(chapter_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 522aa3d63..bb231ecb1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)' _TEST = { 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', - 'md5': '3b427ae4b9d60619106de3185c2987cd', + 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { 'id': '95008', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', 'duration': 194, 'view_count': int, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 60e68d98a..3ad993751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'duration': 38, - 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...', + 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', } }, { 'note': 'Video without discernible title', diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 721e5fce0..d966e8403 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor): }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', - 'md5': 'ff4d83318f89776ed0250634cfaa8d36', + 'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad', 'info_dict': { 'id': 'e402820827', 'ext': 'mp4', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 367f930dd..c16da70f1 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -155,7 +155,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', @@ -180,13 +179,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -226,21 +225,6 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, - # smotri embed - { - 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', - 'md5': 'ec40048448e9284c9a1de77bb188108b', - 'info_dict': { - 'id': 'v27008541fad', - 'ext': 'mp4', - 'title': 'Крым и Севастополь вошли в состав России', - 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', - 'duration': 900, - 'upload_date': '20140318', - 'uploader': 'rbctv_2012_4', - 'uploader_id': 'rbctv_2012_4', - }, - }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -295,13 +279,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, @@ -397,12 +381,6 @@ class GenericIE(InfoExtractor): }, ] - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning('Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) @@ -502,6 +480,7 @@ class GenericIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url) force_videoid = None + is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid @@ -544,6 +523,9 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } + if not self._downloader.params.get('test', False) and not is_intentional: + self._downloader.report_warning('Falling back on generic information extractor.') + try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -657,6 +639,16 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) + # Look for embedded Dailymotion playlist player (#3822) + m = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) + if m: + playlists = re.findall( + r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) + if playlists: + return _playlist_from_matches( + playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + # Look for embedded Wistia player match = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py new file mode 100644 index 000000000..77c3ad4fc --- /dev/null +++ b/youtube_dl/extractor/globo.py @@ -0,0 +1,398 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import math + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + compat_str, + compat_chr, + compat_ord, +) + + +class GloboIE(InfoExtractor): + _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + + _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' + _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' + + _VIDEOID_REGEXES = [ + r'\bdata-video-id="(\d+)"', + r'\bdata-player-videosids="(\d+)"', + r'<div[^>]+\bid="(\d+)"', + ] + + _RESIGN_EXPIRATION = 86400 + + _TESTS = [ + { + 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', + 'md5': '03ebf41cb7ade43581608b7d9b71fab0', + 'info_dict': { + 'id': '3654973', + 'ext': 'mp4', + 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', + 'duration': 251.585, + 'uploader': 'SporTV', + 'uploader_id': 698, + 'like_count': int, + } + }, + { + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'Globo.com', + 'uploader_id': 265, + 'like_count': int, + } + }, + { + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', + 'info_dict': { + 'id': '3652183', + 'ext': 'mp4', + 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', + 'duration': 110.711, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + 'like_count': int, + } + }, + ] + + class MD5(): + HEX_FORMAT_LOWERCASE = 0 + HEX_FORMAT_UPPERCASE = 1 + BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' + BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' + PADDING = '=0xFF01DD' + hexcase = 0 + b64pad = '' + + def __init__(self): + pass + + class JSArray(list): + def __getitem__(self, y): + try: + return list.__getitem__(self, y) + except IndexError: + return 0 + + def __setitem__(self, i, y): + try: + return list.__setitem__(self, i, y) + except IndexError: + self.extend([0] * (i - len(self) + 1)) + self[-1] = y + + @classmethod + def hex_md5(cls, param1): + return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) + + @classmethod + def b64_md5(cls, param1, param2=None): + return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) + + @classmethod + def any_md5(cls, param1, param2): + return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) + + @classmethod + def rstr_md5(cls, param1): + return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) + + @classmethod + def rstr2hex(cls, param1): + _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' + _loc_3 = '' + for _loc_5 in range(0, len(param1)): + _loc_4 = compat_ord(param1[_loc_5]) + _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] + return _loc_3 + + @classmethod + def rstr2b64(cls, param1): + _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + _loc_3 = '' + _loc_4 = len(param1) + for _loc_5 in range(0, _loc_4, 3): + _loc_6_1 = compat_ord(param1[_loc_5]) << 16 + _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 + _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 + _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 + for _loc_7 in range(0, 4): + if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: + _loc_3 += cls.b64pad + else: + _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] + return _loc_3 + + @staticmethod + def rstr2any(param1, param2): + _loc_3 = len(param2) + _loc_4 = [] + _loc_9 = [0] * ((len(param1) >> 2) + 1) + for _loc_5 in range(0, len(_loc_9)): + _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) + + while len(_loc_9) > 0: + _loc_8 = [] + _loc_7 = 0 + for _loc_5 in range(0, len(_loc_9)): + _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] + _loc_6 = math.floor(_loc_7 / _loc_3) + _loc_7 -= _loc_6 * _loc_3 + if len(_loc_8) > 0 or _loc_6 > 0: + _loc_8[len(_loc_8)] = _loc_6 + + _loc_4[len(_loc_4)] = _loc_7 + _loc_9 = _loc_8 + + _loc_10 = '' + _loc_5 = len(_loc_4) - 1 + while _loc_5 >= 0: + _loc_10 += param2[_loc_4[_loc_5]] + _loc_5 -= 1 + + return _loc_10 + + @classmethod + def str2rstr_utf8(cls, param1, param2=None): + _loc_3 = '' + _loc_4 = -1 + if not param2: + param2 = cls.PADDING + param1 = param1 + param2[1:9] + while True: + _loc_4 += 1 + if _loc_4 >= len(param1): + break + _loc_5 = compat_ord(param1[_loc_4]) + _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 + if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: + _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) + _loc_4 += 1 + if _loc_5 <= 127: + _loc_3 += compat_chr(_loc_5) + continue + if _loc_5 <= 2047: + _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) + continue + if _loc_5 <= 65535: + _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( + 128 | _loc_5 & 63) + continue + if _loc_5 <= 2097151: + _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( + 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) + return _loc_3 + + @staticmethod + def rstr2binl(param1): + _loc_2 = [0] * ((len(param1) >> 2) + 1) + for _loc_3 in range(0, len(_loc_2)): + _loc_2[_loc_3] = 0 + for _loc_3 in range(0, len(param1) * 8, 8): + _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 + return _loc_2 + + @staticmethod + def binl2rstr(param1): + _loc_2 = '' + for _loc_3 in range(0, len(param1) * 32, 8): + _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) + return _loc_2 + + @classmethod + def binl_md5(cls, param1, param2): + param1 = cls.JSArray(param1) + param1[param2 >> 5] |= 128 << param2 % 32 + param1[(param2 + 64 >> 9 << 4) + 14] = param2 + _loc_3 = 1732584193 + _loc_4 = -271733879 + _loc_5 = -1732584194 + _loc_6 = 271733878 + for _loc_7 in range(0, len(param1), 16): + _loc_8 = _loc_3 + _loc_9 = _loc_4 + _loc_10 = _loc_5 + _loc_11 = _loc_6 + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) + _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) + _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) + _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) + _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) + _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) + _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) + _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) + _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) + _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) + _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) + _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) + _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) + _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) + _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) + _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) + _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) + _loc_3 = cls.safe_add(_loc_3, _loc_8) + _loc_4 = cls.safe_add(_loc_4, _loc_9) + _loc_5 = cls.safe_add(_loc_5, _loc_10) + _loc_6 = cls.safe_add(_loc_6, _loc_11) + return [_loc_3, _loc_4, _loc_5, _loc_6] + + @classmethod + def md5_cmn(cls, param1, param2, param3, param4, param5, param6): + return cls.safe_add( + cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) + + @classmethod + def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) + + @classmethod + def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) + + @classmethod + def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) + + @classmethod + def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): + return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) + + @classmethod + def safe_add(cls, param1, param2): + _loc_3 = (param1 & 65535) + (param2 & 65535) + _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) + return cls.lshift(_loc_4, 16) | _loc_3 & 65535 + + @classmethod + def bit_rol(cls, param1, param2): + return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) + + @staticmethod + def lshift(value, count): + r = (0xFFFFFFFF & value) << count + return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + + video = self._download_json( + self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + + title = video['title'] + duration = float_or_none(video['duration'], 1000) + like_count = video['likes'] + uploader = video['channel'] + uploader_id = video['channel_id'] + + formats = [] + + for resource in video['resources']: + resource_id = resource.get('_id') + if not resource_id: + continue + + security = self._download_json( + self._SECURITY_URL_TEMPLATE % (video_id, resource_id), + video_id, 'Downloading security hash for %s' % resource_id) + + security_hash = security.get('hash') + if not security_hash: + message = security.get('message') + if message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, message), expected=True) + continue + + hash_code = security_hash[:2] + received_time = int(security_hash[2:12]) + received_random = security_hash[12:22] + received_md5 = security_hash[22:] + + sign_time = received_time + self._RESIGN_EXPIRATION + padding = '%010d' % random.randint(1, 10000000000) + + signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) + signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + + formats.append({ + 'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'), + 'format_id': resource_id, + 'height': resource['height'] + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'formats': formats + }
\ No newline at end of file diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 73bd6d890..363dc6608 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor): 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), video_id, 'Downloading player config XML') - video_url = config.find('.//file').text - uploader = config.find('.//author').text - timestamp = parse_iso8601(config.find('.//date').text) - duration = parse_duration(config.find('.//duration').text) - thumbnail = config.find('.//image').text + video_url = config.find('file').text + uploader = config.find('author').text + timestamp = parse_iso8601(config.find('date').text) + duration = parse_duration(config.find('duration').text) + thumbnail = config.find('image').text media = self._download_xml( 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') - title = media.find('.//title').text + title = media.find('title').text return { 'id': video_id, diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py new file mode 100644 index 000000000..53714f47f --- /dev/null +++ b/youtube_dl/extractor/golem.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + determine_ext, +) + + +class GolemIE(InfoExtractor): + _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/' + _TEST = { + 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', + 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', + 'info_dict': { + 'id': '14095', + 'format_id': 'high', + 'ext': 'mp4', + 'title': 'iPhone 6 und 6 Plus - Test', + 'duration': 300.44, + 'filesize': 65309548, + } + } + + _PREFIX = 'http://video.golem.de' + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_xml( + 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) + + info = { + 'id': video_id, + 'title': config.findtext('./title', 'golem'), + 'duration': self._float(config.findtext('./playtime'), 'duration'), + } + + formats = [] + for e in config: + url = e.findtext('./url') + if not url: + continue + + formats.append({ + 'format_id': e.tag, + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'height': self._int(e.get('height'), 'height'), + 'width': self._int(e.get('width'), 'width'), + 'filesize': self._int(e.findtext('filesize'), 'filesize'), + 'ext': determine_ext(e.findtext('./filename')), + }) + self._sort_formats(formats) + info['formats'] = formats + + thumbnails = [] + for e in config.findall('.//teaser'): + url = e.findtext('./url') + if not url: + continue + thumbnails.append({ + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'width': self._int(e.get('width'), 'thumbnail width'), + 'height': self._int(e.get('height'), 'thumbnail height'), + }) + info['thumbnails'] = thumbnails + + return info diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index ca5f7c417..45cca1d24 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, determine_ext, compat_urllib_parse, compat_urllib_request, @@ -12,20 +13,22 @@ from ..utils import ( class GorillaVidIE(InfoExtractor): - IE_DESC = 'GorillaVid.in and daclips.in' + IE_DESC = 'GorillaVid.in, daclips.in and movpod.in' _VALID_URL = r'''(?x) https?://(?P<host>(?:www\.)? - (?:daclips\.in|gorillavid\.in))/ + (?:daclips\.in|gorillavid\.in|movpod\.in))/ (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? ''' + _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' + _TESTS = [{ 'url': 'http://gorillavid.in/06y9juieqpmi', 'md5': '5ae4a3580620380619678ee4875893ba', 'info_dict': { 'id': '06y9juieqpmi', 'ext': 'flv', - 'title': 'Rebecca Black My Moment Official Music Video Reaction', + 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor): 'title': 'Micro Pig piglets ready on 16th July 2009', 'thumbnail': 're:http://.*\.jpg', }, + }, { + 'url': 'http://movpod.in/0wguyyxi1yca', + 'only_matching': True, }] def _real_extract(self, url): @@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) + if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = dict(re.findall(r'''(?x)<input\s+ type="hidden"\s+ name="([^"]+)"\s+ @@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage(req, video_id, 'Downloading video page') - title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') - thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') - url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') + title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title') + video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url') + thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False) formats = [{ 'format_id': 'sd', - 'url': url, - 'ext': determine_ext(url), + 'url': video_url, + 'ext': determine_ext(video_url), 'quality': 1, }] diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py new file mode 100644 index 000000000..f97b1e085 --- /dev/null +++ b/youtube_dl/extractor/heise.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + get_meta_content, + parse_iso8601, +) + + +class HeiseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?heise\.de/video/artikel/ + .+?(?P<id>[0-9]+)\.html(?:$|[?#]) + ''' + _TEST = { + 'url': ( + 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' + ), + 'md5': 'ffed432483e922e88545ad9f2f15d30e', + 'info_dict': { + 'id': '2404147', + 'ext': 'mp4', + 'title': ( + "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" + ), + 'format_id': 'mp4_720', + 'timestamp': 1411812600, + 'upload_date': '20140927', + 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'json_url:\s*"([^"]+)"', webpage, 'json URL') + config = self._download_json(json_url, video_id) + + info = { + 'id': video_id, + 'thumbnail': config.get('poster'), + 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'description': self._og_search_description(webpage), + } + + title = get_meta_content('fulltitle', webpage) + if title: + info['title'] = title + elif config.get('title'): + info['title'] = config['title'] + else: + info['title'] = self._og_search_title(webpage) + + formats = [] + for t, rs in config['formats'].items(): + if not rs or not hasattr(rs, 'items'): + self._downloader.report_warning( + 'formats: {0}: no resolutions'.format(t)) + continue + + for height_str, obj in rs.items(): + format_id = '{0}_{1}'.format(t, height_str) + + if not obj or not obj.get('url'): + self._downloader.report_warning( + 'formats: {0}: no url'.format(format_id)) + continue + + formats.append({ + 'url': obj['url'], + 'format_id': format_id, + 'height': self._int(height_str, 'height'), + }) + + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 12e9e61c4..c80185b53 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,7 +89,12 @@ class IGNIE(InfoExtractor): '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: - return [self.url_result(u, ie='IGN') for u in multiple_urls] + entries = [self.url_result(u, ie='IGN') for u in multiple_urls] + return { + '_type': 'playlist', + 'id': name_or_id, + 'entries': entries, + } video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 4ddda2f1b..53f9a5f75 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor): _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' _TEST = { - u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', - u'file': u'452693.mp4', - u'info_dict': { - u'title': u'SKYFALL', - u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - u'duration': 153, + 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'info_dict': { + 'id': '452693', + 'ext': 'mp4', + 'title': 'SKYFALL', + 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + 'duration': 149, }, } @@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor): url = self._build_url(query) flashconfiguration = self._download_xml(url, video_id, - u'Downloading flash configuration') + 'Downloading flash configuration') file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor): lambda m: self._clean_query(m.group()), file_url) info = self._download_xml(file_url, video_id, - u'Downloading video info') + 'Downloading video info') item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index a83dd249f..07ef682ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + thumbnail = self._proto_relative_url( + self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( r"adduserUsername\s*=\s*'([^']+)';", diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py index aad782578..122e2dd8c 100644 --- a/youtube_dl/extractor/jpopsukitv.py +++ b/youtube_dl/extractor/jpopsukitv.py @@ -1,8 +1,6 @@ # coding=utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -12,14 +10,14 @@ from ..utils import ( class JpopsukiIE(InfoExtractor): IE_NAME = 'jpopsuki.tv' - _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' + _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P<id>\S+)' _TEST = { 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', 'md5': '88018c0c1a9b1387940e90ec9e7e198e', - 'file': '00be659d23b0b40508169cdee4545771.mp4', 'info_dict': { 'id': '00be659d23b0b40508169cdee4545771', + 'ext': 'mp4', 'title': 'ayumi hamasaki - evolution', 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', @@ -30,8 +28,7 @@ class JpopsukiIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -47,11 +44,9 @@ class JpopsukiIE(InfoExtractor): uploader_id = self._html_search_regex( r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)', webpage, 'video uploader_id', fatal=False) - upload_date = self._html_search_regex( + upload_date = unified_strdate(self._html_search_regex( r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', - fatal=False) - if upload_date is not None: - upload_date = unified_strdate(upload_date) + fatal=False)) view_count_str = self._html_search_regex( r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count', fatal=False) diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index 9b553b9fa..5aa32bf09 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -11,10 +11,9 @@ from ..utils import ( class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html' _TEST = { 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'md5': '1574e9b4d6438446d5b7dbcdf2786276', 'info_dict': { 'id': 'r303r', 'ext': 'flv', @@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) html = self._download_webpage(url, video_id) iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py new file mode 100644 index 000000000..fca0bfef0 --- /dev/null +++ b/youtube_dl/extractor/lrt.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + parse_duration, + remove_end, +) + + +class LRTIE(InfoExtractor): + IE_NAME = 'lrt.lt' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', + 'info_dict': { + 'id': '54391', + 'ext': 'mp4', + 'title': 'Septynios Kauno dienos', + 'description': 'Kauno miesto ir apskrities naujienos', + 'duration': 1783, + }, + 'params': { + 'skip_download': True, # HLS download + }, + + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - LRT') + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + duration = parse_duration(self._search_regex( + r"'duration':\s*'([^']+)',", webpage, + 'duration', fatal=False, default=None)) + + formats = [] + for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): + data = json.loads(js_to_json(js)) + if data['provider'] == 'rtmp': + formats.append({ + 'format_id': 'rtmp', + 'ext': determine_ext(data['file']), + 'url': data['streamer'], + 'play_path': 'mp4:%s' % data['file'], + 'preference': -1, + }) + else: + formats.extend( + self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + } diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 3a938861b..c7f6beb9c 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json from .common import InfoExtractor @@ -23,6 +22,7 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, @@ -33,9 +33,7 @@ class MuenchenTVIE(InfoExtractor): display_id = 'live' webpage = self._download_webpage(url, display_id) - now = datetime.datetime.now() - now_str = now.strftime("%Y-%m-%d %H:%M") - title = self._og_search_title(webpage) + ' ' + now_str + title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( r'(?s)\nplaylist:\s*(\[.*?}\]),related:', @@ -73,5 +71,6 @@ class MuenchenTVIE(InfoExtractor): 'title': title, 'formats': formats, 'is_live': True, + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 963c4587c..cc7c921c3 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + compat_urllib_parse_urlparse, int_or_none, remove_end, ) @@ -13,76 +14,116 @@ from ..utils import ( class NFLIE(InfoExtractor): IE_NAME = 'nfl.com' - _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' - _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' - _TEST = { - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates - 'info_dict': { - 'id': '0ap3000000398478', - 'ext': 'mp4', - 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, - 'thumbnail': 're:^https?://.*\.jpg$', + _VALID_URL = r'''(?x)https?:// + (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ + (?:.+?/)* + (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + _TESTS = [ + { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Redskins vs. Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'info_dict': { + 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'ext': 'mp4', + 'title': 'LIVE: Post Game vs. Browns', + 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', + 'upload_date': '20131229', + 'timestamp': 1388354455, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + ] + + @staticmethod + def prepend_host(host, url): + if not url.startswith('http'): + if not url.startswith('/'): + url = '/%s' % url + url = 'http://{0:}{1:}'.format(host, url) + return url + + @staticmethod + def format_from_stream(stream, protocol, host, path_prefix='', + preference=0, note=None): + url = '{protocol:}://{host:}/{prefix:}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=stream.get('path'), + ) + return { + 'url': url, + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': note, } - } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, host = mobj.group('id'), mobj.group('host') - config = self._download_json(self._PLAYER_CONFIG_URL, video_id, - note='Downloading player config') - url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) - video_data = self._download_json(url_template.format(id=video_id), video_id) + webpage = self._download_webpage(url, video_id) - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) + config_url = NFLIE.prepend_host(host, self._search_regex( + r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL')) + config = self._download_json(config_url, video_id, + note='Downloading player config') + url_template = NFLIE.prepend_host( + host, '{contentURLTemplate:}'.format(**config)) + video_data = self._download_json( + url_template.format(id=video_id), video_id) formats = [] - streams = video_data.get('cdnData', {}).get('bitrateInfo', []) - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - path_prefix = cdn.get('pathprefix', '') - if path_prefix and not path_prefix.endswith('/'): - path_prefix = '%s/' % path_prefix - - get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=p, - ) - - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = -1 - else: - preference = 0 - + cdn_data = video_data.get('cdnData', {}) + streams = cdn_data.get('bitrateInfo', []) + if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': + parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) + protocol, host = parts.scheme, parts.netloc for stream in streams: - path = stream.get('path') - if not path: + formats.append( + NFLIE.format_from_stream(stream, protocol, host)) + else: + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': continue - formats.append({ - 'url': get_url(path), - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': name, - }) + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + prefix = cdn.get('pathprefix', '') + if prefix and not prefix.endswith('/'): + prefix = '%s/' % prefix + + preference = 0 + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = 1 + + for stream in streams: + formats.append( + NFLIE.format_from_stream(stream, protocol, host, + prefix, preference, name)) self._sort_formats(formats) @@ -94,7 +135,7 @@ class NFLIE(InfoExtractor): return { 'id': video_id, - 'title': video_data.get('storyHeadline'), + 'title': video_data.get('headline'), 'formats': formats, 'description': video_data.get('caption'), 'duration': video_data.get('duration'), diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py new file mode 100644 index 000000000..4a41c0542 --- /dev/null +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OktoberfestTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' + + _TEST = { + 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', + 'info_dict': { + 'id': 'hb-zelt', + 'ext': 'mp4', + 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.jpg$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._live_title(self._html_search_regex( + r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + + clip = self._search_regex( + r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') + ncurl = self._search_regex( + r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') + video_url = ncurl + clip + thumbnail = self._search_regex( + r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, + 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'is_live': True, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 2adfde909..8f140d626 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + unified_strdate, US_RATINGS, ) @@ -11,10 +12,10 @@ from ..utils import ( class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: - # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | - # Article with embedded player - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | + # Direct video URL + video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | + # Article with embedded player (or direct video) + (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -65,10 +66,25 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', + 'md5': '908f3e5473a693b266b84e25e1cf9703', + 'info_dict': { + 'id': '2365160389', + 'display_id': 'killer-typhoon', + 'ext': 'mp4', + 'description': 'md5:c741d14e979fc53228c575894094f157', + 'title': 'Killer Typhoon', + 'duration': 3172, + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140122', + } } + ] - def _extract_ids(self, url): + def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) presumptive_id = mobj.group('presumptive_id') @@ -76,15 +92,20 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) + upload_date = unified_strdate(self._search_regex( + r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', + webpage, 'upload date', default=None)) + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] media_id = self._search_regex( MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: - return media_id, presumptive_id + return media_id, presumptive_id, upload_date url = self._search_regex( r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', @@ -104,10 +125,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id + return video_id, display_id, None def _real_extract(self, url): - video_id, display_id = self._extract_ids(url) + video_id, display_id, upload_date = self._extract_webpage(url) info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) @@ -119,6 +140,7 @@ class PBSIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'title': info['title'], 'url': info['alternate_encoding']['url'], 'ext': 'mp4', @@ -126,4 +148,5 @@ class PBSIE(InfoExtractor): 'thumbnail': info.get('image_url'), 'duration': info.get('duration'), 'age_limit': age_limit, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py new file mode 100644 index 000000000..596c621d7 --- /dev/null +++ b/youtube_dl/extractor/planetaplay.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PlanetaPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)' + _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' + _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' + _TEST = { + 'url': 'http://planetaplay.com/?sng=3586', + 'md5': '9d569dceb7251a4e01355d5aea60f9db', + 'info_dict': { + 'id': '3586', + 'ext': 'flv', + 'title': 'md5:e829428ee28b1deed00de90de49d1da1', + } + } + + _SONG_FORMATS = { + 'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), + 'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + response = self._download_json( + self._API_URL.format(video_id), video_id)['response'] + try: + data = response.get('data')[0] + except IndexError: + raise ExtractorError( + '%s: failed to get the playlist' % self.IE_NAME, expected=True) + + title = '{song_artists:} - {sng_name:}'.format(**data) + thumbnail = self._THUMBNAIL_URL.format(**data) + + formats = [] + for format_id, (quality, url_template) in self._SONG_FORMATS.items(): + formats.append({ + 'format_id': format_id, + 'url': url_template.format(**data), + 'quality': quality, + 'ext': 'flv', + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py new file mode 100644 index 000000000..645a1e06d --- /dev/null +++ b/youtube_dl/extractor/played.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import os.path + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + + +class PlayedIE(InfoExtractor): + IE_NAME = 'played.to' + _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' + + _TEST = { + 'url': 'http://played.to/j2f2sfiiukgt', + 'md5': 'c2bd75a368e82980e7257bf500c00637', + 'info_dict': { + 'id': 'j2f2sfiiukgt', + 'ext': 'flv', + 'title': 'youtube-dl_test_video.mp4', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + orig_webpage = self._download_webpage(url, video_id) + fields = re.findall( + r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) + data = dict(fields) + + self._sleep(2, video_id) + + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') + + title = os.path.splitext(data['fname'])[0] + + video_url = self._search_regex( + r'file: "?(.+?)",', webpage, 'video URL') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 5b2a723c1..619496de7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor): 'id': '2156342', 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', - 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528', + 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.', 'duration': 307.24, }, 'params': { @@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - page = self._download_webpage(url, video_id, 'Downloading page') - - clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id') + clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'testclient' client_name = 'kolibri-1.2.5' @@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor): urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, page, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(page) + title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') + description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None)) + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) formats = [] diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 2007a0013..94602e89e 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -9,7 +9,6 @@ from ..utils import ( compat_urllib_parse, unified_strdate, str_to_int, - int_or_none, ) from ..aes import aes_decrypt_text @@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') + title = self._html_search_regex( + r'<h1>([^<]+)', webpage, 'title') description = self._html_search_regex( - r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False) + r'<div\s+id="descriptionContent">([^<]+)<', + webpage, 'description', fatal=False) thumbnail = self._html_search_regex( - r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) + r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', + webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( - r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) + r'by:\s*<a [^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False) - upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) - - view_count = self._html_search_regex( - r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) - comment_count = int_or_none(self._html_search_regex( - r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False)) + r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', + webpage, 'uploader id', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'</a> on (.+?) at \d+:\d+', + webpage, 'upload date', fatal=False)) - video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) + view_count = str_to_int(self._html_search_regex( + r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>', + webpage, 'comment count', fatal=False)) + + video_urls = list(map( + compat_urllib_parse.unquote, + re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + password = self._html_search_regex( + r'flashvars\.video_title = "([^"]+)', + webpage, 'password').replace('+', ' ') + video_urls = list(map( + lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), + video_urls)) formats = [] for video_url in video_urls: diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 185353bef..abb827783 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor): 'info_dict': { 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'ext': 'mp4', - 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', + 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE', + 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 7de3c9dd5..263f09b46 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False)) + r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False)) + r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py new file mode 100644 index 000000000..77e056242 --- /dev/null +++ b/youtube_dl/extractor/tapely.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + compat_urllib_request, + float_or_none, + parse_iso8601, +) + + +class TapelyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' + _API_URL = 'http://tape.ly/showtape?id={0:}' + _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' + _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' + _TESTS = [ + { + 'url': 'http://tape.ly/my-grief-as-told-by-water', + 'info_dict': { + 'id': 23952, + 'title': 'my grief as told by water', + 'thumbnail': 're:^https?://.*\.png$', + 'uploader_id': 16484, + 'timestamp': 1411848286, + 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', + }, + 'playlist_count': 13, + }, + { + 'url': 'http://tape.ly/my-grief-as-told-by-water/1', + 'md5': '79031f459fdec6530663b854cbc5715c', + 'info_dict': { + 'id': 258464, + 'title': 'Dreaming Awake (My Brightest Diamond)', + 'ext': 'm4a', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + playlist_url = self._API_URL.format(display_id) + request = compat_urllib_request.Request(playlist_url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + request.add_header('Accept', 'application/json') + + playlist = self._download_json(request, display_id) + + tape = playlist['tape'] + + entries = [] + for s in tape['songs']: + song = s['song'] + entry = { + 'id': song['id'], + 'duration': float_or_none(song.get('songduration'), 1000), + 'title': song['title'], + } + if song['source'] == 'S3': + entry.update({ + 'url': self._S3_SONG_URL.format(song['filename']), + }) + entries.append(entry) + elif song['source'] == 'YT': + self.to_screen('YouTube video detected') + yt_id = song['filename'].replace('/youtube/', '') + entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) + entries.append(entry) + elif song['source'] == 'SC': + self.to_screen('SoundCloud song detected') + sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) + entry.update(self.url_result(sc_url, 'Soundcloud')) + entries.append(entry) + else: + self.report_warning('Unknown song source: %s' % song['source']) + + if mobj.group('songnr'): + songnr = int(mobj.group('songnr')) - 1 + try: + return entries[songnr] + except IndexError: + raise ExtractorError( + 'No song with index: %s' % mobj.group('songnr'), + expected=True) + + return { + '_type': 'playlist', + 'id': tape['id'], + 'display_id': display_id, + 'title': tape['name'], + 'entries': entries, + 'thumbnail': tape.get('image_url'), + 'description': clean_html(tape.get('subtext')), + 'like_count': tape.get('likescount'), + 'uploader_id': tape.get('user_id'), + 'timestamp': parse_iso8601(tape.get('published_at')), + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1cca47771..d5e28efad 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor): thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'], + 'title': talk_info['title'].strip(), 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py new file mode 100644 index 000000000..a77c6a2fc --- /dev/null +++ b/youtube_dl/extractor/thesixtyone.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class TheSixtyOneIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ + (?:.*?/)* + (?: + s| + song/comments/list| + song + )/(?P<id>[A-Za-z0-9]+)/?$''' + _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' + _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream' + _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' + _TESTS = [ + { + 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', + 'md5': '821cc43b0530d3222e3e2b70bb4622ea', + 'info_dict': { + 'id': 'SrE3zD7s1jt', + 'ext': 'mp3', + 'title': 'CASIO - Unicorn War Mixtape', + 'thumbnail': 're:^https?://.*_desktop$', + 'upload_date': '20071217', + 'duration': 3208, + } + }, + { + 'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', + 'only_matching': True, + }, + { + 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', + 'only_matching': True, + }, + ] + + _DECODE_MAP = { + "x": "a", + "m": "b", + "w": "c", + "q": "d", + "n": "e", + "p": "f", + "a": "0", + "h": "1", + "e": "2", + "u": "3", + "s": "4", + "i": "5", + "o": "6", + "y": "7", + "r": "8", + "c": "9" + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + song_id = mobj.group('id') + + webpage = self._download_webpage( + self._SONG_URL_TEMPLATE.format(song_id), song_id) + + song_data = json.loads(self._search_regex( + r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')) + keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] + url = self._SONG_FILE_URL_TEMPLATE.format( + "".join(reversed(keys)), **song_data) + + formats = [{ + 'format_id': 'sd', + 'url': url, + 'ext': 'mp3', + }] + + return { + 'id': song_id, + 'title': '{artist:} - {name:}'.format(**song_data), + 'formats': formats, + 'comment_count': song_data.get('comments_count'), + 'duration': song_data.get('play_time'), + 'like_count': song_data.get('score'), + 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), + 'upload_date': unified_strdate(song_data.get('publish_date')), + } diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 607e947bb..496f15d80 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # extract download link from mobile player page webpage_player = self._download_webpage( @@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor): 'description': description, 'upload_date': upload_date } + + +class THVideoPlaylistIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/mylist2', + 'info_dict': { + 'id': '2', + 'title': '幻想万華鏡', + }, + 'playlist_mincount': 23, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + list_title = self._html_search_regex( + r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title', + fatal=False) + + entries = [ + self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') + for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] + + return self.playlist_result(entries, playlist_id, list_title) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index dc8697850..27962b5fe 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -17,16 +17,16 @@ class TvigleIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.tvigle.ru/video/brat-2/', - 'md5': '72cb7eab33e54314e1790da402d3c9c3', + 'url': 'http://www.tvigle.ru/video/brat/', + 'md5': 'ff4344a4894b0524441fb6f8218dc716', 'info_dict': { - 'id': '5119390', - 'display_id': 'brat-2', + 'id': '5118490', + 'display_id': 'brat', 'ext': 'mp4', - 'title': 'Брат 2 ', - 'description': 'md5:5751f4fe345a58e1692585c361294bd8', - 'duration': 7356.369, - 'age_limit': 0, + 'title': 'Брат', + 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb', + 'duration': 5722.6, + 'age_limit': 16, }, }, { @@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor): 'format_id': '%s-%s' % (vcodec, quality), 'vcodec': vcodec, 'height': int(quality[:-1]), + 'filesize': item['video_files_size'][vcodec][quality], }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ebab8b86c..5b1a3ec78 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_HTTPError, compat_urllib_request, ExtractorError, ) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 7d27d6c57..964470070 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen', 'info_dict': { 'id': '100764', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', 'thumbnail': 're:^https?://.*\.jpg', @@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor): 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen', 'info_dict': { 'id': '100015', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!', 'description': 'md5:9a60cc23fa349f761628924e56eeec2d', 'thumbnail': 're:^https?://.*\.jpg', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bc01d7fbf..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,17 +8,19 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -89,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:380943ec71b89736ff4bf27183233d09', 'duration': 1595, }, }, @@ -103,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, + 'description': None, }, }, { @@ -117,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -203,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id @@ -273,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description - video_description = None - try: - video_description = get_element_by_attribute("class", "description_wrapper", webpage) - if video_description: - video_description = clean_html(video_description) - except AssertionError as err: - # On some pages like (http://player.vimeo.com/video/54469442) the - # html tags are not closed, python 2.6 cannot handle it - if err.args[0] == 'we should not get here!': - pass - else: - raise + + video_description = self._html_search_regex( + r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', + webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_meta( + 'description', webpage, default=None) + if not video_description and mobj.group('pro'): + orig_webpage = self._download_webpage( + orig_url, video_id, + note='Downloading webpage for description', + fatal=False) + if orig_webpage: + video_description = self._html_search_meta( + 'description', orig_webpage, default=None) + if not video_description and not mobj.group('player'): + self._downloader.report_warning('Cannot find video description') # Extract video duration video_duration = int_or_none(config["video"].get("duration")) @@ -529,3 +540,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): def _real_extract(self, url): return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + + +class VimeoLikesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' + IE_NAME = 'vimeo:likes' + IE_DESC = 'Vimeo user likes' + _TEST = { + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, + "info_dict": { + "description": "See all the videos urza likes", + "title": 'Videos urza likes', + }, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> + .*?</a></li>\s*<li\s+class="pagination_next"> + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', + webpage, 'video content') + paths = re.findall( + r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) + for path in paths: + yield { + '_type': 'url', + 'url': compat_urlparse.urljoin(page_url, path), + } + + pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': 'user%s_likes' % user_id, + 'title': title, + 'description': description, + 'entries': pl, + } diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index fb0600f1a..ec3c010ad 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + ExtractorError, parse_duration, qualities, ) @@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434', - 'md5': '92ac9d1ccefec4f0bb474661ab144fcf', + 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', 'info_dict': { - 'id': '843902317', + 'id': '922692425', 'ext': '3gp', - 'title': 'Movie Trailer: Noah', - 'duration': 139, + 'title': 'The Toy Soldiers - Hollywood Movie Trailer', + 'duration': 180, } } @@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor): webpage = self._download_webpage( adfree_url, video_id, note='Download post-ad page') + error_msg = self._html_search_regex( + r'<p class="message">(.*?)</p>', webpage, 'error message', + default=None) + if error_msg: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_msg), expected=True) + + # These clowns alternate between two page types links_code = self._search_regex( - r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage, - 'links') + r'''(?xs) + (?: + <img\s+src="/im/play.gif".*?>| + <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> + ) + (.*?) + (?: + <a\s+href="fblike|<div\s+class="social"> + ) + ''', webpage, 'links') title = self._html_search_regex( r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip() quality_order = qualities(['Reg', 'Hi']) formats = [] for url, q in re.findall( - r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code): + r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q formats.append({ 'format_id': format_id, @@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor): self._sort_formats(formats) duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False)) + r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 4e89acd81..bda3870db 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor): "info_dict": { "id": "wshh6a7q1ny0G34ZwuIO", "ext": "mp4", - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" + "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage_src = self._download_webpage(url, video_id) - - m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) + m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) if m_vevo_id is not None: return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') video_url = self._search_regex( - r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') + r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL') if 'youtube' in video_url: return self.url_result(video_url, ie='Youtube') video_title = self._html_search_regex( - r"<title>(.*)</title>", webpage_src, 'title') + r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>', + webpage, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', + r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', fatal=False) if not thumbnail: - _title = r"""candytitles.*>(.*)</span>""" - mobj = re.search(_title, webpage_src) + _title = r'candytitles.*>(.*)</span>' + mobj = re.search(_title, webpage) if mobj is not None: video_title = mobj.group(1) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 3ab6017cd..221341c13 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -38,16 +38,6 @@ class YahooIE(InfoExtractor): }, }, { - 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', - 'md5': '410b7104aa9893b765bc22787a22f3d9', - 'info_dict': { - 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', - 'ext': 'mp4', - 'title': 'The World Loves Spider-Man', - 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', - } - }, - { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', 'md5': '60e8ac193d8fb71997caa8fce54c6460', 'info_dict': { diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 24872861a..944d7da38 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,7 @@ class YnetIE(InfoExtractor): _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'md5': '4b29cb57c3dddd57642b3f051f535b07', 'info_dict': { 'id': 'L-11659-99244', 'ext': 'flv', @@ -22,7 +22,7 @@ class YnetIE(InfoExtractor): } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', - 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'md5': '8194c2ea221e9a639cac96b6b0753dc5', 'info_dict': { 'id': 'L-8859-84418', 'ext': 'flv', @@ -33,9 +33,7 @@ class YnetIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 07ed7cbd1..48d47a245 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import json import math import random import re diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..9041cfa87 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, int_or_none, - PagedList, + OnDemandPagedList, unescapeHTML, unified_strdate, orderedSet, @@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + pref_cookies = [ + c for c in self._downloader.cookiejar + if c.domain == '.youtube.com' and c.name == 'PREF'] + for pc in pref_cookies: + if 'hl=' in pc.value: + pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) + else: + if pc.value: + pc.value += '&' + pc.value += 'hl=en' video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor): 'id': video_id, 'title': title, } - url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) + url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 44dcb1e34..f651337ad 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None): for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) - opts[i+1] = '<PRIVATE>' + opts[i+1] = 'PRIVATE' except ValueError: pass return opts diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..d7ae5a90a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -673,6 +673,8 @@ class ExtractorError(Exception): expected = True if video_id is not None: msg = video_id + ': ' + msg + if cause: + msg += u' (caused by %r)' % cause if not expected: msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) @@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['User-agent'] req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): @@ -884,7 +892,9 @@ def unified_strdate(date_str): '%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d %H:%M:%S', + '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1384,14 +1394,16 @@ def check_executable(exe, args=[]): class PagedList(object): - def __init__(self, pagefunc, pagesize): - self._pagefunc = pagefunc - self._pagesize = pagesize - def __len__(self): # This is only useful for tests return len(self.getslice()) + +class OnDemandPagedList(PagedList): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1442,35 @@ class PagedList(object): return res +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagefunc = pagefunc + self._pagecount = pagecount + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page = list(self._pagefunc(pagenum)) + if skip_elems: + page = page[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page) < only_more: + only_more -= len(page) + else: + page = page[:only_more] + res.extend(page) + break + res.extend(page) + return res + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( @@ -1534,33 +1575,37 @@ US_RATINGS = { } +def parse_age_limit(s): + if s is None: + return None + m = re.match(r'^(?P<age>\d{1,2})\+?$', s) + return int(m.group('age')) if m else US_RATINGS.get(s, None) + + def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) def js_to_json(code): def fix_kv(m): - key = m.group(2) - if key.startswith("'"): - assert key.endswith("'") - assert '"' not in key - key = '"%s"' % key[1:-1] - elif not key.startswith('"'): - key = '"%s"' % key - - value = m.group(4) - if value.startswith("'"): - assert value.endswith("'") - assert '"' not in value - value = '"%s"' % value[1:-1] - - return m.group(1) + key + m.group(3) + value + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + if v.startswith('"'): + return v + if v.startswith("'"): + v = v[1:-1] + v = re.sub(r"\\\\|\\'|\"", lambda m: { + '\\\\': '\\\\', + "\\'": "'", + '"': '\\"', + }[m.group(0)], v) + return '"%s"' % v res = re.sub(r'''(?x) - ([{,]\s*) - ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) - (:\s*) - ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) + "(?:[^"\\]*(?:\\\\|\\")?)*"| + '(?:[^'\\]*(?:\\\\|\\')?)*'| + [a-zA-Z_][a-zA-Z_0-9]* ''', fix_kv, code) res = re.sub(r',(\s*\])', lambda m: m.group(1), res) return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c17701d6a..4f0d486b9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.25' +__version__ = '2014.10.05.2' |