diff options
Diffstat (limited to 'youtube_dl')
38 files changed, 1716 insertions, 239 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 14a1d06ab..a671d6450 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -162,6 +162,7 @@ class YoutubeDL(object): default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. + extract_flat: Do not resolve URLs, return the immediate result. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -479,7 +480,10 @@ class YoutubeDL(object): return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) age_limit = self.params.get('age_limit') if age_limit is not None: - if age_limit < info_dict.get('age_limit', 0): + actual_age_limit = info_dict.get('age_limit') + if actual_age_limit is None: + actual_age_limit = 0 + if age_limit < actual_age_limit: return 'Skipping "' + title + '" because it is age restricted' if self.in_download_archive(info_dict): return '%s has already been recorded in archive' % video_title @@ -558,7 +562,12 @@ class YoutubeDL(object): Returns the resolved ie_result. """ - result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system + result_type = ie_result.get('_type', 'video') + + if self.params.get('extract_flat', False): + if result_type in ('url', 'url_transparent'): + return ie_result + if result_type == 'video': self.add_extra_info(ie_result, extra_info) return self.process_video_result(ie_result, download=download) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 80de211e7..a96bf9b5c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -69,6 +69,10 @@ __authors__ = ( 'Dobrosław Żybort', 'David Fabijan', 'Sebastian Haas', + 'Alexander Kirk', + 'Erik Johnson', + 'Keith Beckman', + 'Ole Ernst', ) __license__ = 'Public Domain' diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f79e6a995..d01d1897e 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -27,8 +27,16 @@ class HttpFD(FileDownloader): headers['Youtubedl-user-agent'] = info_dict['user_agent'] if 'http_referer' in info_dict: headers['Referer'] = info_dict['http_referer'] - basic_request = compat_urllib_request.Request(url, None, headers) - request = compat_urllib_request.Request(url, None, headers) + add_headers = info_dict.get('http_headers') + if add_headers: + headers.update(add_headers) + data = info_dict.get('http_post_data') + http_method = info_dict.get('http_method') + basic_request = compat_urllib_request.Request(url, data, headers) + request = compat_urllib_request.Request(url, data, headers) + if http_method is not None: + basic_request.get_method = lambda: http_method + request.get_method = lambda: http_method is_test = self.params.get('test', False) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 27602e0c0..de6e8ee30 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -69,6 +69,7 @@ from .dfb import DFBIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .drtv import DRTVIE +from .dump import DumpIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE @@ -77,12 +78,17 @@ from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .eitb import EitbIE +from .ellentv import ( + EllenTVIE, + EllenTVClipsIE, +) from .elpais import ElPaisIE from .empflix import EmpflixIE from .engadget import EngadgetIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE +from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE @@ -110,7 +116,10 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .funnyordie import FunnyOrDieIE from .gamekings import GamekingsIE -from .gameone import GameOneIE +from .gameone import ( + GameOneIE, + GameOnePlaylistIE, +) from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gametrailers import GametrailersIE @@ -121,6 +130,7 @@ from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE +from .grooveshark import GroovesharkIE from .hark import HarkIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE @@ -147,6 +157,7 @@ from .ivi import ( from .izlesene import IzleseneIE from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .jpopsukitv import JpopsukiIE @@ -177,7 +188,9 @@ from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE +from .ministrygrid import MinistryGridIE from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE from .mpora import MporaIE @@ -187,6 +200,7 @@ from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE from .moviezine import MoviezineIE from .movshare import MovShareIE from .mtv import ( @@ -233,8 +247,10 @@ from .orf import ( ORFFM4IE, ) from .parliamentliveuk import ParliamentLiveUKIE +from .patreon import PatreonIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .playfm import PlayFMIE from .playvid import PlayvidIE from .podomatic import PodomaticIE from .pornhd import PornHdIE @@ -252,9 +268,10 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE +from .rtlnl import RtlXlIE from .rtlnow import RTLnowIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE +from .rtve import RTVEALaCartaIE, RTVELiveIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, @@ -265,6 +282,7 @@ from .rutube import ( from .rutv import RUTVIE from .sapo import SapoIE from .savefrom import SaveFromIE +from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE from .servingsys import ServingSysIE @@ -377,6 +395,7 @@ from .vuclip import VuClipIE from .vulture import VultureIE from .washingtonpost import WashingtonPostIE from .wat import WatIE +from .wayofthemaster import WayOfTheMasterIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 7e93bc4df..748608826 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,5 +1,7 @@ #coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -13,13 +15,14 @@ class AparatIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' _TEST = { - u'url': u'http://www.aparat.com/v/wP8On', - u'file': u'wP8On.mp4', - u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', - u'info_dict': { - u"title": u"تیم گلکسی 11 - زومیت", + 'url': 'http://www.aparat.com/v/wP8On', + 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'info_dict': { + 'id': 'wP8On', + 'ext': 'mp4', + 'title': 'تیم گلکسی 11 - زومیت', }, - #u'skip': u'Extremely unreliable', + # 'skip': 'Extremely unreliable', } def _real_extract(self, url): @@ -29,8 +32,8 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + - video_id + u'/vt/frame') + embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + + video_id + '/vt/frame') webpage = self._download_webpage(embed_url, video_id) video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d86dbba8e..1c72b2ff6 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -177,16 +177,26 @@ class ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)' + _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/(?:magazine?/)?(?P<id>[^?#]+)' - _TEST = { + _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', 'info_dict': { - 'id': '050489-002', + 'id': '72176', 'ext': 'mp4', - 'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design', + 'title': 'Folge 2 - Corporate Design', + 'upload_date': '20131004', }, - } + }, { + 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', + 'info_dict': { + 'id': '160676', + 'ext': 'mp4', + 'title': 'Monty Python live (mostly)', + 'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', + 'upload_date': '20140805', + } + }] class ArteTVFutureIE(ArteTVPlus7IE): diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index acfc4ad73..261ead98f 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -15,7 +15,7 @@ from ..utils import ( class BlipTVIE(SubtitlesInfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))' + _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_TESTS]+)))' _TESTS = [ { @@ -49,6 +49,21 @@ class BlipTVIE(SubtitlesInfoExtractor): 'uploader_id': '792887', 'duration': 279, } + }, + { + # https://bugzilla.redhat.com/show_bug.cgi?id=967465 + 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', + 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', + 'info_dict': { + 'id': '6573122', + 'ext': 'mov', + 'upload_date': '20130520', + 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', + 'title': 'Red vs. Blue Season 11 Trailer', + 'timestamp': 1369029609, + 'uploader': 'redvsblue', + 'uploader_id': '792887', + } } ] @@ -150,7 +165,7 @@ class BlipTVIE(SubtitlesInfoExtractor): class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' _PAGE_SIZE = 12 IE_NAME = 'blip.tv:user' diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 419951b62..294670386 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -154,12 +154,14 @@ class BrightcoveIE(InfoExtractor): def _extract_brightcove_urls(cls, webpage): """Return a list of all Brightcove URLs from the webpage """ - url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage) + url_m = re.search( + r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"', + webpage) if url_m: url = unescapeHTML(url_m.group(1)) # Some sites don't add it, we can't download with this url, for example: # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ - if 'playerKey' in url: + if 'playerKey' in url or 'videoId' in url: return [url] matches = re.findall( @@ -188,9 +190,13 @@ class BrightcoveIE(InfoExtractor): referer = smuggled_data.get('Referer', url) return self._get_video_info( videoPlayer[0], query_str, query, referer=referer) - else: + elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) + else: + raise ExtractorError( + 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', + expected=True) def _get_video_info(self, video_id, query_str, query, referer=None): request_url = self._FEDERATED_URL_TEMPLATE % query_str @@ -202,6 +208,13 @@ class BrightcoveIE(InfoExtractor): req.add_header('Referer', referer) webpage = self._download_webpage(req, video_id) + error_msg = self._html_search_regex( + r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage, + 'error message', default=None) + if error_msg is not None: + raise ExtractorError( + 'brightcove said: %s' % error_msg, expected=True) + self.report_extraction(video_id) info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json') info = json.loads(info)['data'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 45a17f8ad..4d5b48167 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -84,6 +84,12 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * http_referer HTTP Referer header value to set. + * http_method HTTP method to use for the download. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * http_post_data Additional data to send with a POST + request. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -479,8 +485,9 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = self._og_regexes('video') - if secure: regexes = self._og_regexes('video:secure_url') + regexes + regexes = self._og_regexes('video') + self._og_regexes('video:url') + if secure: + regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) def _og_search_url(self, html, **kargs): diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py new file mode 100644 index 000000000..6b651778a --- /dev/null +++ b/youtube_dl/extractor/dump.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DumpIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/' + + _TEST = { + 'url': 'http://www.dump.com/oneus/', + 'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99', + 'info_dict': { + 'id': 'oneus', + 'ext': 'flv', + 'title': "He's one of us.", + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex( + r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL') + + thumb = self._og_search_thumbnail(webpage) + title = self._search_regex(r'<b>([^"]+)</b>', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumb, + } diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index 877113d63..63c2549d3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -1,19 +1,21 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import determine_ext class EbaumsWorldIE(InfoExtractor): _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' _TEST = { - u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', - u'file': u'83367677.mp4', - u'info_dict': { - u'title': u'A Giant Python Opens The Door', - u'description': u'This is how nightmares start...', - u'uploader': u'jihadpizza', + 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', + 'info_dict': { + 'id': '83367677', + 'ext': 'mp4', + 'title': 'A Giant Python Opens The Door', + 'description': 'This is how nightmares start...', + 'uploader': 'jihadpizza', }, } @@ -28,7 +30,6 @@ class EbaumsWorldIE(InfoExtractor): 'id': video_id, 'title': config.find('title').text, 'url': video_url, - 'ext': determine_ext(video_url), 'description': config.find('description').text, 'thumbnail': config.find('image').text, 'uploader': config.find('username').text, diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py new file mode 100644 index 000000000..3e7923648 --- /dev/null +++ b/youtube_dl/extractor/ellentv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class EllenTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)' + _TEST = { + 'url': 'http://www.ellentv.com/videos/0-7jqrsr18/', + 'md5': 'e4af06f3bf0d5f471921a18db5764642', + 'info_dict': { + 'id': '0-7jqrsr18', + 'ext': 'mp4', + 'title': 'What\'s Wrong with These Photos? A Whole Lot', + 'timestamp': 1406876400, + 'upload_date': '20140801', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + timestamp = parse_iso8601(self._search_regex( + r'<span class="publish-date"><time datetime="([^"]+)">', + webpage, 'timestamp')) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'url': self._html_search_meta('VideoURL', webpage, 'url'), + 'timestamp': timestamp, + } + + +class EllenTVClipsIE(InfoExtractor): + IE_NAME = 'EllenTV:clips' + _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' + _TEST = { + 'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', + 'info_dict': { + 'id': 'meryl-streep-vanessa-hudgens', + 'title': 'Meryl Streep, Vanessa Hudgens', + }, + 'playlist_mincount': 9, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + playlist = self._extract_playlist(webpage) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._og_search_title(webpage), + 'entries': self._extract_entries(playlist) + } + + def _extract_playlist(self, webpage): + json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') + try: + return json.loads("[{" + json_string + "}]") + except ValueError as ve: + raise ExtractorError('Failed to download JSON', cause=ve) + + def _extract_entries(self, playlist): + return [self.url_result(item['url'], 'EllenTV') for item in playlist] diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 272dfe1f6..476fc22b9 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,7 +36,7 @@ class EscapistIE(InfoExtractor): r'<meta name="description" content="([^"]*)"', webpage, 'description', fatal=False) - playerUrl = self._og_search_video_url(webpage, name=u'player URL') + playerUrl = self._og_search_video_url(webpage, name='player URL') title = self._html_search_regex( r'<meta name="title" content="([^"]*)"', diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py new file mode 100644 index 000000000..a38b773e8 --- /dev/null +++ b/youtube_dl/extractor/expotv.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class ExpoTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' + _TEST = { + 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', + 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', + 'info_dict': { + 'id': '17561', + 'ext': 'mp4', + 'upload_date': '20060212', + 'title': 'My Favorite Online Scrapbook Store', + 'view_count': int, + 'description': 'You\'ll find most everything you need at this virtual store front.', + 'uploader': 'Anna T.', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_key = self._search_regex( + r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') + config_url = 'http://client.expotv.com/video/config/%s/%s' % ( + video_id, player_key) + config = self._download_json( + config_url, video_id, + note='Downloading video configuration') + + formats = [{ + 'url': fcfg['file'], + 'height': int_or_none(fcfg.get('height')), + 'format_note': fcfg.get('label'), + 'ext': self._search_regex( + r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'], + 'file extension', default=None), + } for fcfg in config['sources']] + self._sort_formats(formats) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = config.get('image') + view_count = int_or_none(self._search_regex( + r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts')) + uploader = self._search_regex( + r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader', + fatal=False) + upload_date = unified_strdate(self._search_regex( + r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', + fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'view_count': view_count, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py index b580f52fb..3022f539d 100644 --- a/youtube_dl/extractor/gameone.py +++ b/youtube_dl/extractor/gameone.py @@ -88,3 +88,28 @@ class GameOneIE(InfoExtractor): 'age_limit': age_limit, 'timestamp': timestamp, } + + +class GameOnePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$' + IE_NAME = 'gameone:playlist' + _TEST = { + 'url': 'http://www.gameone.de/tv', + 'info_dict': { + 'title': 'GameOne', + }, + 'playlist_mincount': 294, + } + + def _real_extract(self, url): + webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') + max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage))) + entries = [ + self.url_result('http://www.gameone.de/tv/%d' % video_id, 'GameOne') + for video_id in range(max_id, 0, -1)] + + return { + '_type': 'playlist', + 'title': 'GameOne', + 'entries': entries, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8e915735e..8b11f7f7a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -15,11 +15,14 @@ from ..utils import ( compat_xml_parse_error, ExtractorError, + float_or_none, HEADRequest, + orderedSet, parse_xml, smuggle_url, unescapeHTML, unified_strdate, + unsmuggle_url, url_basename, ) from .brightcove import BrightcoveIE @@ -289,6 +292,58 @@ class GenericIE(InfoExtractor): 'description': 'Mario\'s life in the fast lane has never looked so good.', }, }, + # YouTube embed via <data-embed-url=""> + { + 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', + 'info_dict': { + 'id': 'jpSGZsgga_I', + 'ext': 'mp4', + 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'uploader': 'Gameloft', + 'uploader_id': 'gameloft', + 'upload_date': '20130821', + 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + }, + 'params': { + 'skip_download': True, + } + }, + # Camtasia studio + { + 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', + 'playlist': [{ + 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', + 'info_dict': { + 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', + 'ext': 'flv', + 'duration': 2235.90, + } + }, { + 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', + 'info_dict': { + 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', + 'ext': 'flv', + 'duration': 2235.93, + } + }], + 'info_dict': { + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + } + }, + # Flowplayer + { + 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', + 'md5': '9d65602bf31c6e20014319c7d07fba27', + 'info_dict': { + 'id': '5123ea6d5e5a7', + 'ext': 'mp4', + 'age_limit': 18, + 'uploader': 'www.handjobhub.com', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', + } + } ] def report_download_webpage(self, video_id): @@ -301,58 +356,6 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def _send_head(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - - class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HEADRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - try: - # This function was deprecated in python 3.3 and removed in 3.4 - origin_req_host = req.get_origin_req_host() - except AttributeError: - origin_req_host = req.origin_req_host - return HEADRequest(newurl, - headers=newheaders, - origin_req_host=origin_req_host, - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HEADRequest(url)) - if response is None: - raise ExtractorError('Invalid URL protocol') - return response - def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -372,6 +375,43 @@ class GenericIE(InfoExtractor): 'entries': entries, } + def _extract_camtasia(self, url, video_id, webpage): + """ Returns None if no camtasia video can be found. """ + + camtasia_cfg = self._search_regex( + r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', + webpage, 'camtasia configuration file', default=None) + if camtasia_cfg is None: + return None + + title = self._html_search_meta('DC.title', webpage, fatal=True) + + camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) + camtasia_cfg = self._download_xml( + camtasia_url, video_id, + note='Downloading camtasia configuration', + errnote='Failed to download camtasia configuration') + fileset_node = camtasia_cfg.find('./playlist/array/fileset') + + entries = [] + for n in fileset_node.getchildren(): + url_n = n.find('./uri') + if url_n is None: + continue + + entries.append({ + 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], + 'title': '%s - %s' % (title, n.tag), + 'url': compat_urlparse.urljoin(url, url_n.text), + 'duration': float_or_none(n.find('./duration').text), + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': title, + } + def _real_extract(self, url): if url.startswith('//'): return { @@ -408,17 +448,31 @@ class GenericIE(InfoExtractor): else: assert ':' in default_search return self.url_result(default_search + url) - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + + url, smuggled_data = unsmuggle_url(url) + force_videoid = None + if smuggled_data and 'force_videoid' in smuggled_data: + force_videoid = smuggled_data['force_videoid'] + video_id = force_videoid + else: + video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] self.to_screen('%s: Requesting header' % video_id) - try: - response = self._send_head(url) + head_req = HEADRequest(url) + response = self._request_webpage( + head_req, video_id, + note=False, errnote='Could not send HEAD request to %s' % url, + fatal=False) + if response is not False: # Check for redirect new_url = response.geturl() if url != new_url: self.report_following_redirect(new_url) + if force_videoid: + new_url = smuggle_url( + new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) # Check for direct link to a video @@ -439,10 +493,6 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } - except compat_urllib_error.HTTPError: - # This may be a stupid server that doesn't like HEAD, our UA, or so - pass - try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -460,6 +510,11 @@ class GenericIE(InfoExtractor): except compat_xml_parse_error: pass + # Is it a Camtasia project? + camtasia_res = self._extract_camtasia(url, video_id, webpage) + if camtasia_res is not None: + return camtasia_res + # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/rg3/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way @@ -475,10 +530,26 @@ class GenericIE(InfoExtractor): r'(?s)<title>(.*?)</title>', webpage, 'video title', default='video') + # Try to detect age limit automatically + age_limit = self._rta_search(webpage) + # And then there are the jokers who advertise that they use RTA, + # but actually don't. + AGE_LIMIT_MARKERS = [ + r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>', + ] + if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): + age_limit = 18 + # video uploader is domain name video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + # Helper method + def _playlist_from_matches(matches, getter, ie=None): + urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches) + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for BrightCove: bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) if bc_urls: @@ -514,6 +585,7 @@ class GenericIE(InfoExtractor): matches = re.findall(r'''(?x) (?: <iframe[^>]+?src=| + data-video-url=| <embed[^>]+?src=| embedSWF\(?:\s* ) @@ -522,19 +594,15 @@ class GenericIE(InfoExtractor): (?:embed|v)/.+?) \1''', webpage) if matches: - urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') - for tuppl in matches] - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) + return _playlist_from_matches( + matches, lambda m: unescapeHTML(m[1]), ie='Youtube') # Look for embedded Dailymotion player matches = re.findall( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) if matches: - urlrs = [self.url_result(unescapeHTML(tuppl[1])) - for tuppl in matches] - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) + return _playlist_from_matches( + matches, lambda m: unescapeHTML(m[1])) # Look for embedded Wistia player match = re.search( @@ -553,7 +621,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) if mobj: return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage) + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) if mobj: return self.url_result(mobj.group(1), 'BlipTV') @@ -648,10 +716,8 @@ class GenericIE(InfoExtractor): # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) if matches: - urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie') - for eurl in matches] - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) + return _playlist_from_matches( + matches, getter=unescapeHTML, ie='FunnyOrDie') # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) @@ -713,6 +779,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Yahoo') + # Look for embedded sbs.com.au player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'SBS') + # Start with something easy: JW Player in SWFObject found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if not found: @@ -731,6 +804,15 @@ class GenericIE(InfoExtractor): # Broaden the findall a little bit: JWPlayer JS loader found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) if not found: + # Flow player + found = re.findall(r'''(?xs) + flowplayer\("[^"]+",\s* + \{[^}]+?\}\s*, + \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* + ["']?url["']?\s*:\s*["']([^"']+)["'] + ''', webpage) + assert found + if not found: # Try to find twitter cards info found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) if not found: @@ -739,7 +821,12 @@ class GenericIE(InfoExtractor): m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: - found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) + def check_video(vurl): + vpath = compat_urlparse.urlparse(vurl).path + return '.' in vpath and not vpath.endswith('.swf') + found = list(filter( + check_video, + re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))) if not found: # HTML5 video found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage) @@ -776,6 +863,7 @@ class GenericIE(InfoExtractor): 'url': video_url, 'uploader': video_uploader, 'title': video_title, + 'age_limit': age_limit, }) if len(entries) == 1: diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py new file mode 100644 index 000000000..726adff77 --- /dev/null +++ b/youtube_dl/extractor/grooveshark.py @@ -0,0 +1,190 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import math +import os.path +import re + + +from .common import InfoExtractor +from ..utils import ExtractorError, compat_urllib_request, compat_html_parser + +from ..utils import ( + compat_urllib_parse, + compat_urlparse, +) + + +class GroovesharkHtmlParser(compat_html_parser.HTMLParser): + def __init__(self): + self._current_object = None + self.objects = [] + compat_html_parser.HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + attrs = dict((k, v) for k, v in attrs) + if tag == 'object': + self._current_object = {'attrs': attrs, 'params': []} + elif tag == 'param': + self._current_object['params'].append(attrs) + + def handle_endtag(self, tag): + if tag == 'object': + self.objects.append(self._current_object) + self._current_object = None + + @classmethod + def extract_object_tags(cls, html): + p = cls() + p.feed(html) + p.close() + return p.objects + + +class GroovesharkIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)' + _TEST = { + 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5', + 'md5': '7ecf8aefa59d6b2098517e1baa530023', + 'info_dict': { + 'id': '6SS1DW', + 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)', + 'ext': 'mp3', + 'duration': 227, + } + } + + do_playerpage_request = True + do_bootstrap_request = True + + def _parse_target(self, target): + uri = compat_urlparse.urlparse(target) + hash = uri.fragment[1:].split('?')[0] + token = os.path.basename(hash.rstrip('/')) + return (uri, hash, token) + + def _build_bootstrap_url(self, target): + (uri, hash, token) = self._parse_target(target) + query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) + return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) + + def _build_meta_url(self, target): + (uri, hash, token) = self._parse_target(target) + query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) + return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) + + def _build_stream_url(self, meta): + return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None)) + + def _build_swf_referer(self, target, obj): + (uri, _, _) = self._parse_target(target) + return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None)) + + def _transform_bootstrap(self, js): + return re.split('(?m)^\s*try\s*{', js)[0] \ + .split(' = ', 1)[1].strip().rstrip(';') + + def _transform_meta(self, js): + return js.split('\n')[0].split('=')[1].rstrip(';') + + def _get_meta(self, target): + (meta_url, token) = self._build_meta_url(target) + self.to_screen('Metadata URL: %s' % meta_url) + + headers = {'Referer': compat_urlparse.urldefrag(target)[0]} + req = compat_urllib_request.Request(meta_url, headers=headers) + res = self._download_json(req, token, + transform_source=self._transform_meta) + + if 'getStreamKeyWithSong' not in res: + raise ExtractorError( + 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.') + + if res['getStreamKeyWithSong'] is None: + raise ExtractorError( + 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.', + expected=True) + + return res['getStreamKeyWithSong'] + + def _get_bootstrap(self, target): + (bootstrap_url, token) = self._build_bootstrap_url(target) + + headers = {'Referer': compat_urlparse.urldefrag(target)[0]} + req = compat_urllib_request.Request(bootstrap_url, headers=headers) + res = self._download_json(req, token, fatal=False, + note='Downloading player bootstrap data', + errnote='Unable to download player bootstrap data', + transform_source=self._transform_bootstrap) + return res + + def _get_playerpage(self, target): + (_, _, token) = self._parse_target(target) + + webpage = self._download_webpage( + target, token, + note='Downloading player page', + errnote='Unable to download player page', + fatal=False) + + if webpage is not None: + # Search (for example German) error message + error_msg = self._html_search_regex( + r'<div id="content">\s*<h2>(.*?)</h2>', webpage, + 'error message', default=None) + if error_msg is not None: + error_msg = error_msg.replace('\n', ' ') + raise ExtractorError('Grooveshark said: %s' % error_msg) + + if webpage is not None: + o = GroovesharkHtmlParser.extract_object_tags(webpage) + return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) + + return (webpage, None) + + def _real_initialize(self): + self.ts = int(time.time() * 1000) # timestamp in millis + + def _real_extract(self, url): + (target_uri, _, token) = self._parse_target(url) + + # 1. Fill cookiejar by making a request to the player page + swf_referer = None + if self.do_playerpage_request: + (_, player_objs) = self._get_playerpage(url) + if player_objs is not None: + swf_referer = self._build_swf_referer(url, player_objs[0]) + self.to_screen('SWF Referer: %s' % swf_referer) + + # 2. Ask preload.php for swf bootstrap data to better mimic webapp + if self.do_bootstrap_request: + bootstrap = self._get_bootstrap(url) + self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken']) + + # 3. Ask preload.php for track metadata. + meta = self._get_meta(url) + + # 4. Construct stream request for track. + stream_url = self._build_stream_url(meta) + duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000)) + post_dict = {'streamKey': meta['streamKey']['streamKey']} + post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8') + headers = { + 'Content-Length': len(post_data), + 'Content-Type': 'application/x-www-form-urlencoded' + } + if swf_referer is not None: + headers['Referer'] = swf_referer + + return { + 'id': token, + 'title': meta['song']['Name'], + 'http_method': 'POST', + 'url': stream_url, + 'ext': 'mp3', + 'format': 'mp3 audio', + 'duration': duration, + 'http_post_data': post_data, + 'http_headers': headers, + } diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py new file mode 100644 index 000000000..cf73cd753 --- /dev/null +++ b/youtube_dl/extractor/jove.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unified_strdate +) + + +class JoveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)' + _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}' + _TESTS = [ + { + 'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current', + 'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b', + 'info_dict': { + 'id': '2744', + 'ext': 'mp4', + 'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', + 'description': 'md5:015dd4509649c0908bc27f049e0262c6', + 'thumbnail': 're:^https?://.*\.png$', + 'upload_date': '20110523', + } + }, + { + 'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', + 'md5': '914aeb356f416811d911996434811beb', + 'info_dict': { + 'id': '51796', + 'ext': 'mp4', + 'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', + 'description': 'md5:35ff029261900583970c4023b70f1dc9', + 'thumbnail': 're:^https?://.*\.png$', + 'upload_date': '20140802', + } + }, + + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + chapters_id = self._html_search_regex( + r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id') + + chapters_xml = self._download_xml( + self._CHAPTERS_URL.format(video_id=chapters_id), + video_id, note='Downloading chapters XML', + errnote='Failed to download chapters XML') + + video_url = chapters_xml.attrib.get('video') + if not video_url: + raise ExtractorError('Failed to get the video URL') + + title = self._html_search_meta('citation_title', webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>', + webpage, 'description', fatal=False) + publish_date = unified_strdate(self._html_search_meta( + 'citation_publication_date', webpage, 'publish date', fatal=False)) + comment_count = self._html_search_regex( + r'<meta name="num_comments" content="(\d+) Comments?"', + webpage, 'comment count', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'description': description, + 'upload_date': publish_date, + 'comment_count': comment_count, + } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 6436c05a3..1a896b536 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -9,6 +9,7 @@ from ..utils import ( compat_urllib_request, determine_ext, ExtractorError, + int_or_none, ) @@ -83,6 +84,21 @@ class MetacafeIE(InfoExtractor): 'skip_download': True, }, }, + # Movieclips.com video + { + 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', + 'info_dict': { + 'id': 'mv-Wy7ZU', + 'ext': 'mp4', + 'title': 'My Week with Marilyn - Do You Love Me?', + 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', + 'uploader': 'movie_trailers', + 'duration': 176, + }, + 'params': { + 'skip_download': 'requires rtmpdump', + } + } ] def report_disclaimer(self): @@ -134,6 +150,7 @@ class MetacafeIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) + video_url = None mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: mediaURL = compat_urllib_parse.unquote(mobj.group(1)) @@ -146,16 +163,17 @@ class MetacafeIE(InfoExtractor): else: gdaKey = mobj.group(1) video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: + if video_url is None: mobj = re.search(r'<video src="([^"]+)"', webpage) if mobj: video_url = mobj.group(1) video_ext = 'mp4' - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) + if video_url is None: + flashvars = self._search_regex( + r' name="flashvars" value="(.*?)"', webpage, 'flashvars', + default=None) + if flashvars: + vardict = compat_parse_qs(flashvars) if 'mediaData' not in vardict: raise ExtractorError('Unable to extract media URL') mobj = re.search( @@ -165,26 +183,68 @@ class MetacafeIE(InfoExtractor): mediaURL = mobj.group('mediaURL').replace('\\/', '/') video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) video_ext = determine_ext(video_url) - - video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title') + if video_url is None: + player_url = self._search_regex( + r"swfobject\.embedSWF\('([^']+)'", + webpage, 'config URL', default=None) + if player_url: + config_url = self._search_regex( + r'config=(.+)$', player_url, 'config URL') + config_doc = self._download_xml( + config_url, video_id, + note='Downloading video config') + smil_url = config_doc.find('.//properties').attrib['smil_file'] + smil_doc = self._download_xml( + smil_url, video_id, + note='Downloading SMIL document') + base_url = smil_doc.find('./head/meta').attrib['base'] + video_url = [] + for vn in smil_doc.findall('.//video'): + br = int(vn.attrib['system-bitrate']) + play_path = vn.attrib['src'] + video_url.append({ + 'format_id': 'smil-%d' % br, + 'url': base_url, + 'play_path': play_path, + 'page_url': url, + 'player_url': player_url, + 'ext': play_path.partition(':')[0], + }) + + if video_url is None: + raise ExtractorError('Unsupported video type') + + video_title = self._html_search_regex( + r'(?im)<title>(.*) - Video</title>', webpage, 'title') description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) + duration = int_or_none( + self._html_search_meta('video:duration', webpage)) + + age_limit = ( + 18 + if re.search(r'"contentRating":"restricted"', webpage) + else 0) - if re.search(r'"contentRating":"restricted"', webpage) is not None: - age_limit = 18 + if isinstance(video_url, list): + formats = video_url else: - age_limit = 0 + formats = [{ + 'url': video_url, + 'ext': video_ext, + }] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'description': description, 'uploader': video_uploader, 'title': video_title, - 'thumbnail':thumbnail, - 'ext': video_ext, + 'thumbnail': thumbnail, 'age_limit': age_limit, + 'formats': formats, + 'duration': duration, } diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py new file mode 100644 index 000000000..949ad11db --- /dev/null +++ b/youtube_dl/extractor/ministrygrid.py @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + smuggle_url, +) + + +class MinistryGridIE(InfoExtractor): + _VALID_URL = r'https?://www\.ministrygrid.com/([^/?#]*/)*(?P<id>[^/#?]+)/?(?:$|[?#])' + + _TEST = { + 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', + 'md5': '844be0d2a1340422759c2a9101bab017', + 'info_dict': { + 'id': '3453494717001', + 'ext': 'mp4', + 'title': 'The Gospel by Numbers', + 'description': 'Coming soon from T4G 2014!', + 'uploader': 'LifeWay Christian Resources (MG)', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + portlets_json = self._search_regex( + r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list') + portlets = json.loads(portlets_json) + pl_id = self._search_regex( + r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id') + + for i, portlet in enumerate(portlets): + portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) + portlet_code = self._download_webpage( + portlet_url, video_id, + note='Looking in portlet %s (%d/%d)' % (portlet, i + 1, len(portlets)), + fatal=False) + video_iframe_url = self._search_regex( + r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe', + default=None) + if video_iframe_url: + surl = smuggle_url( + video_iframe_url, {'force_videoid': video_id}) + return { + '_type': 'url', + 'id': video_id, + 'url': surl, + } + + raise ExtractorError('Could not find video iframe in any portlets') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py new file mode 100644 index 000000000..979f3d692 --- /dev/null +++ b/youtube_dl/extractor/mitele.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + get_element_by_attribute, + parse_duration, + strip_jsonp, +) + + +class MiTeleIE(InfoExtractor): + IE_NAME = 'mitele.es' + _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/' + + _TEST = { + 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', + 'md5': '6a75fe9d0d3275bead0cb683c616fddb', + 'info_dict': { + 'id': '0fce117d', + 'ext': 'mp4', + 'title': 'Programa 144 - Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'display_id': 'programa-144', + 'duration': 2913, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = mobj.group('episode') + webpage = self._download_webpage(url, episode) + embed_data_json = self._search_regex( + r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', + flags=re.DOTALL + ).replace('\'', '"') + embed_data = json.loads(embed_data_json) + + info_url = embed_data['flashvars']['host'] + info_el = self._download_xml(info_url, episode).find('./video/info') + + video_link = info_el.find('videoUrl/link').text + token_query = compat_urllib_parse.urlencode({'id': video_link}) + token_info = self._download_json( + 'http://token.mitele.es/?' + token_query, episode, + transform_source=strip_jsonp + ) + + return { + 'id': embed_data['videoId'], + 'display_id': episode, + 'title': info_el.find('title').text, + 'url': token_info['tokenizedUrl'], + 'description': get_element_by_attribute('class', 'text', webpage), + 'thumbnail': info_el.find('thumb').text, + 'duration': parse_duration(info_el.find('duration').text), + } diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py new file mode 100644 index 000000000..456807dd1 --- /dev/null +++ b/youtube_dl/extractor/movieclips.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_str, + clean_html, +) + + +class MovieClipsIE(InfoExtractor): + _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?' + _TEST = { + 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', + 'info_dict': { + 'id': 'Wy7ZU', + 'display_id': 'my-week-with-marilyn-movie-do-you-love-me', + 'ext': 'mp4', + 'title': 'My Week with Marilyn - Do You Love Me?', + 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + show_id = display_id or video_id + + config = self._download_xml( + 'http://config.movieclips.com/player/config/%s' % video_id, + show_id, 'Downloading player config') + + if config.find('./country-region').text == 'false': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) + + properties = config.find('./video/properties') + smil_file = properties.attrib['smil_file'] + + smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') + base_url = smil.find('./head/meta').attrib['base'] + + formats = [] + for video in smil.findall('./body/switch/video'): + vbr = int(video.attrib['system-bitrate']) / 1000 + src = video.attrib['src'] + formats.append({ + 'url': base_url, + 'play_path': src, + 'ext': src.split(':')[0], + 'vbr': vbr, + 'format_id': '%dk' % vbr, + }) + + self._sort_formats(formats) + + title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title']) + description = clean_html(compat_str(properties.attrib['clip_description'])) + thumbnail = properties.attrib['image'] + categories = properties.attrib['clip_categories'].split(',') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index 280328b78..58ec81f91 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -38,7 +38,7 @@ class NuvidIE(InfoExtractor): webpage = self._download_webpage( request, video_id, 'Downloading %s page' % format_id) video_url = self._html_search_regex( - r'<a href="([^"]+)"\s*>Continue to watch video', webpage, '%s video URL' % format_id, fatal=False) + r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False) if not video_url: continue formats.append({ @@ -49,19 +49,24 @@ class NuvidIE(InfoExtractor): webpage = self._download_webpage( 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page') title = self._html_search_regex( - r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>', webpage, 'title').strip() - thumbnail = self._html_search_regex( - r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"', - webpage, 'thumbnail URL', fatal=False) + [r'<span title="([^"]+)">', + r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip() + thumbnails = [ + { + 'url': thumb_url, + } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) + ] + thumbnail = thumbnails[0]['url'] if thumbnails else None duration = parse_duration(self._html_search_regex( - r'Length:\s*<span>(\d{2}:\d{2})</span>',webpage, 'duration', fatal=False)) + r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False)) upload_date = unified_strdate(self._html_search_regex( - r'Added:\s*<span>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload date', fatal=False)) + r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, - 'thumbnail': 'http://m.nuvid.com%s' % thumbnail, + 'thumbnails': thumbnails, + 'thumbnail': thumbnail, 'duration': duration, 'upload_date': upload_date, 'age_limit': 18, diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py new file mode 100644 index 000000000..5429592a7 --- /dev/null +++ b/youtube_dl/extractor/patreon.py @@ -0,0 +1,100 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, +) + + +class PatreonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)' + _TESTS = [ + { + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + }, + }, + { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + }, + }, + ] + + # Currently Patreon exposes download URL via hidden CSS, so login is not + # needed. Keeping this commented for when this inevitably changes. + ''' + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'redirectUrl': 'http://www.patreon.com/', + 'email': username, + 'password': password, + } + + request = compat_urllib_request.Request( + 'https://www.patreon.com/processLogin', + compat_urllib_parse.urlencode(login_form).encode('utf-8') + ) + login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() + ''' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage).strip() + + attach_fn = self._html_search_regex( + r'<div class="attach"><a target="_blank" href="([^"]+)">', + webpage, 'attachment URL', default=None) + if attach_fn is not None: + video_url = 'http://www.patreon.com' + attach_fn + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<strong>(.*?)</strong> is creating', webpage, 'uploader') + else: + playlist_js = self._search_regex( + r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', + webpage, 'playlist JSON') + playlist_json = js_to_json(playlist_js) + playlist = json.loads(playlist_json) + data = playlist[0] + video_url = self._proto_relative_url(data['mp3']) + thumbnail = self._proto_relative_url(data.get('cover')) + uploader = data.get('artist') + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp3', + 'title': title, + 'uploader': uploader, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ec95d0704..2adfde909 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -20,17 +20,53 @@ class PBSIE(InfoExtractor): ) ''' - _TEST = { - 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', - 'info_dict': { - 'id': '2365006249', - 'ext': 'mp4', - 'title': 'A More Perfect Union', - 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', - 'duration': 3190, + _TESTS = [ + { + 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', + 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'info_dict': { + 'id': '2365006249', + 'ext': 'mp4', + 'title': 'A More Perfect Union', + 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', + 'duration': 3190, + }, + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', + 'md5': '143c98aa54a346738a3d78f54c925321', + 'info_dict': { + 'id': '2365297690', + 'ext': 'mp4', + 'title': 'Losing Iraq', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 5050, + }, + }, + { + 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', + 'md5': 'b19856d7f5351b17a5ab1dc6a64be633', + 'info_dict': { + 'id': '2201174722', + 'ext': 'mp4', + 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', + 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', + 'duration': 801, + }, }, - } + { + 'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/', + 'md5': 'c62859342be2a0358d6c9eb306595978', + 'info_dict': { + 'id': '2365297708', + 'ext': 'mp4', + 'description': 'md5:68d87ef760660eb564455eb30ca464fe', + 'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', + 'duration': 6559, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + ] def _extract_ids(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,15 +76,18 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) - # frontline video embed + MEDIA_ID_REGEXES = [ + r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed + r'class="coveplayerid">([^<]+)<', # coveplayer + ] + media_id = self._search_regex( - r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", - webpage, 'frontline video ID', fatal=False, default=None) + MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: return media_id, presumptive_id url = self._search_regex( - r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', + r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', webpage, 'player URL') mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py new file mode 100644 index 000000000..72df4d842 --- /dev/null +++ b/youtube_dl/extractor/playfm.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, + ExtractorError, + float_or_none, + int_or_none, +) + + +class PlayFMIE(InfoExtractor): + IE_NAME = 'play.fm' + _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])' + + _TEST = { + 'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', + 'md5': 'c505f8307825a245d0c7ad1850001f22', + 'info_dict': { + 'id': '137220', + 'ext': 'mp3', + 'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'uploader': 'Sven Tasnadi', + 'uploader_id': 'sventasnadi', + 'duration': 5627.428, + 'upload_date': '20140712', + 'view_count': int, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + upload_date = mobj.group('upload_date') + + rec_data = compat_urllib_parse.urlencode({'rec_id': video_id}) + req = compat_urllib_request.Request( + 'http://www.play.fm/flexRead/recording', data=rec_data) + req.add_header('Content-Type', 'application/x-www-form-urlencoded') + rec_doc = self._download_xml(req, video_id) + + error_node = rec_doc.find('./error') + if error_node is not None: + raise ExtractorError('An error occured: %s (code %s)' % ( + error_node.text, rec_doc.find('./status').text)) + + recording = rec_doc.find('./recording') + title = recording.find('./title').text + view_count = int_or_none(recording.find('./stats/playcount').text) + duration = float_or_none(recording.find('./duration').text, scale=1000) + thumbnail = recording.find('./image').text + + artist = recording.find('./artists/artist') + uploader = artist.find('./name').text + uploader_id = artist.find('./slug').text + + video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % ( + 'http:', recording.find('./url').text, + recording.find('./_class').text, recording.find('./file_id').text, + rec_doc.find('./uuid').text, video_id, + rec_doc.find('./jingle/file_id').text, + 'http%3A%2F%2Fwww.play.fm%2Fplayer', + ) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp3', + 'filesize': int_or_none(recording.find('./size').text), + 'title': title, + 'upload_date': upload_date, + 'view_count': view_count, + 'duration': duration, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + } diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 35dc5a9ff..04bd3d979 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,15 +11,16 @@ from ..utils import ( class PornotubeIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' + _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' _TEST = { - u'url': u'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', - u'file': u'1689755.flv', - u'md5': u'374dd6dcedd24234453b295209aa69b6', - u'info_dict': { - u"upload_date": u"20090708", - u"title": u"Marilyn-Monroe-Bathing", - u"age_limit": 18 + 'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', + 'md5': '374dd6dcedd24234453b295209aa69b6', + 'info_dict': { + 'id': '1689755', + 'ext': 'flv', + 'upload_date': '20090708', + 'title': 'Marilyn-Monroe-Bathing', + 'age_limit': 18 } } @@ -32,22 +35,22 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url') video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) age_limit = self._rta_search(webpage) - info = {'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': upload_date, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'age_limit': age_limit} - - return [info] + return { + 'id': video_id, + 'url': video_url, + 'upload_date': upload_date, + 'title': video_title, + 'ext': 'flv', + 'format': 'flv', + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 000000000..190c8f226 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RtlXlIE(InfoExtractor): + IE_NAME = 'rtlxl.nl' + _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + + _TEST = { + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', + 'info_dict': { + 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'ext': 'flv', + 'title': 'RTL Nieuws - Laat', + 'description': 'Dagelijks het laatste nieuws uit binnen- en ' + 'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van ' + 'onze mobiele apps.', + 'timestamp': 1408051800, + 'upload_date': '20140814', + }, + 'params': { + # We download the first bytes of the first fragment, it can't be + # processed by the f4m downloader beacuse it isn't complete + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uuid = mobj.group('uuid') + + info = self._download_json( + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + uuid) + material = info['material'][0] + episode_info = info['episodes'][0] + + f4m_url = 'http://manifest.us.rtl.nl' + material['videopath'] + progname = info['abstracts'][0]['name'] + subtitle = material['title'] or info['episodes'][0]['name'] + + return { + 'id': uuid, + 'title': '%s - %s' % (progname, subtitle), + 'formats': self._extract_f4m_formats(f4m_url, uuid), + 'timestamp': material['original_date'], + 'description': episode_info['synopsis'], + } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index c2228b2f0..4dd35a47b 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -1,21 +1,66 @@ # encoding: utf-8 from __future__ import unicode_literals -import re import base64 +import re +import time from .common import InfoExtractor from ..utils import ( struct_unpack, + remove_end, ) +def _decrypt_url(png): + encrypted_data = base64.b64decode(png) + text_index = encrypted_data.find(b'tEXt') + text_chunk = encrypted_data[text_index - 4:] + length = struct_unpack('!I', text_chunk[:4])[0] + # Use bytearray to get integers when iterating in both python 2.x and 3.x + data = bytearray(text_chunk[8:8 + length]) + data = [chr(b) for b in data if b != 0] + hash_index = data.index('#') + alphabet_data = data[:hash_index] + url_data = data[hash_index + 1:] + + alphabet = [] + e = 0 + d = 0 + for l in alphabet_data: + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in url_data: + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + return url + + + class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', 'info_dict': { @@ -23,48 +68,15 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', }, - } - - def _decrypt_url(self, png): - encrypted_data = base64.b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index-4:] - length = struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8+length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index+1:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter)*10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url + }, { + 'note': 'Live stream', + 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', + 'info_dict': { + 'id': '1694255', + 'ext': 'flv', + 'title': 'TODO', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -74,11 +86,57 @@ class RTVEALaCartaIE(InfoExtractor): video_id)['page']['items'][0] png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = self._decrypt_url(png) + video_url = _decrypt_url(png) return { 'id': video_id, 'title': info['title'], 'url': video_url, - 'thumbnail': info['image'], + 'thumbnail': info.get('image'), + 'page_url': url, + } + + +class RTVELiveIE(InfoExtractor): + IE_NAME = 'rtve.es:live' + IE_DESC = 'RTVE.es live streams' + _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/noticias/directo-la-1/', + 'info_dict': { + 'id': 'directo-la-1', + 'ext': 'flv', + 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + }, + 'params': { + 'skip_download': 'live stream', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + start_time = time.gmtime() + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_url = self._search_regex( + r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL') + title = remove_end(self._og_search_title(webpage), ' en directo') + title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) + + vidplayer_id = self._search_regex( + r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + + return { + 'id': video_id, + 'ext': 'flv', + 'title': title, + 'url': video_url, + 'app': 'rtve-live-live?ovpfv=2.1.2', + 'player_url': player_url, + 'rtmp_live': True, } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py new file mode 100644 index 000000000..34058fd4b --- /dev/null +++ b/youtube_dl/extractor/sbs.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import json +import re +from .common import InfoExtractor +from ..utils import ( + js_to_json, + remove_end, +) + + +class SBSIE(InfoExtractor): + IE_DESC = 'sbs.com.au' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/' + + _TESTS = [{ + # Original URL is handled by the generic IE which finds the iframe: + # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation + 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', + 'md5': '3150cf278965eeabb5b4cea1c963fe0a', + 'info_dict': { + 'id': '320403011771', + 'ext': 'flv', + 'title': 'Dingo Conservation', + 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction', + 'thumbnail': 're:http://.*\.jpg', + }, + 'add_ies': ['generic'], + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + release_urls_json = js_to_json(self._search_regex( + r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', + webpage, '')) + release_urls = json.loads(release_urls_json) + theplatform_url = ( + release_urls.get('progressive') or release_urls.get('standard')) + + title = remove_end(self._og_search_title(webpage), ' (The Feed)') + description = self._html_search_meta('description', webpage) + thumbnail = self._og_search_thumbnail(webpage) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': theplatform_url, + + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f8dd7e955..fa796ce72 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor): video_id = mobj.group("video_id") if not video_id: video_id = self._html_search_regex( - r'<article class="video" data-id="(\d+?)"', + r'data-node-id="(\d+?)"', webpage, 'video id') data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 6d3b78749..affef6507 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -44,7 +44,7 @@ class VodlockerIE(InfoExtractor): req, video_id, 'Downloading video page') title = self._search_regex( - r'id="file_title".*?>\s*(.*?)\s*<span', webpage, 'title') + r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title') thumbnail = self._search_regex( r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail') url = self._search_regex( diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index a584e0896..1f330378a 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -2,27 +2,30 @@ from __future__ import unicode_literals import re +import time +import hashlib from .common import InfoExtractor from ..utils import ( + ExtractorError, unified_strdate, ) class WatIE(InfoExtractor): - _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html' + _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html' IE_NAME = 'wat.tv' _TEST = { - 'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', + 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', + 'md5': 'ce70e9223945ed26a8056d413ca55dc9', 'info_dict': { - 'id': '10631273', + 'id': '11713067', + 'display_id': 'soupe-figues-l-orange-aux-epices', 'ext': 'mp4', - 'title': 'World War Z - Philadelphia VOST', - 'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, + 'title': 'Soupe de figues à l\'orange et aux épices', + 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', + 'upload_date': '20140819', + 'duration': 120, }, } @@ -36,13 +39,20 @@ class WatIE(InfoExtractor): def real_id_for_chapter(chapter): return chapter['tc_start'].split('-')[0] mobj = re.match(self._VALID_URL, url) - short_id = mobj.group('shortID') - webpage = self._download_webpage(url, short_id) + short_id = mobj.group('short_id') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id or short_id) real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') video_info = self.download_video_info(real_id) + + if video_info.get('geolock'): + raise ExtractorError('This content is not available in your area', expected=True) + chapters = video_info['chapters'] first_chapter = chapters[0] + files = video_info['files'] + first_file = files[0] if real_id_for_chapter(first_chapter) != real_id: self.to_screen('Multipart video detected') @@ -61,12 +71,45 @@ class WatIE(InfoExtractor): upload_date = unified_strdate(first_chapter['date_diffusion']) # Otherwise we can continue and extract just one part, we have to use # the short id for getting the video url + + formats = [{ + 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, + 'format_id': 'Mobile', + }] + + fmts = [('SD', 'web')] + if first_file.get('hasHD'): + fmts.append(('HD', 'webhd')) + + def compute_token(param): + timestamp = '%08x' % int(time.time()) + magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' + return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) + + for fmt in fmts: + webid = '/%s/%s' % (fmt[1], real_id) + video_url = self._download_webpage( + 'http://www.wat.tv/get%s?token=%s&getURL=1' % (webid, compute_token(webid)), + real_id, + 'Downloding %s video URL' % fmt[0], + 'Failed to download %s video URL' % fmt[0], + False) + if not video_url: + continue + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': fmt[0], + }) + return { 'id': real_id, - 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, + 'display_id': display_id, 'title': first_chapter['title'], 'thumbnail': first_chapter['preview'], 'description': first_chapter['description'], 'view_count': video_info['views'], 'upload_date': upload_date, + 'duration': first_file['duration'], + 'formats': formats, } diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py new file mode 100644 index 000000000..af7bb8b49 --- /dev/null +++ b/youtube_dl/extractor/wayofthemaster.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class WayOfTheMasterIE(InfoExtractor): + _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])' + + _TEST = { + 'url': 'http://www.wayofthemaster.com/hbks.shtml', + 'md5': '5316b57487ada8480606a93cb3d18d24', + 'info_dict': { + 'id': 'hbks', + 'ext': 'mp4', + 'title': 'Intelligent Design vs. Evolution', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r'<img src="images/title_[^"]+".*?alt="([^"]+)"', + webpage, 'title', default=None) + if title is None: + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'page title') + + url_base = self._search_regex( + r'<param\s+name="?movie"?\s+value=".*?/wotm_videoplayer_highlow[0-9]*\.swf\?vid=([^"]+)"', + webpage, 'URL base') + formats = [{ + 'format_id': 'low', + 'quality': 1, + 'url': url_base + '_low.mp4', + }, { + 'format_id': 'high', + 'quality': 2, + 'url': url_base + '_high.mp4', + }] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 5374495f9..00b6d1eba 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -14,7 +14,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' + _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' _TESTS = [ { 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5bfe5e7e5..75044d71a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -297,7 +297,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # RTMP (unnamed) @@ -446,6 +446,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: u''.join(s[i] for i in cache_spec) except IOError: pass # No cache available + except ValueError: + try: + file_size = os.path.getsize(cache_fn) + except (OSError, IOError) as oe: + file_size = str(oe) + self._downloader.report_warning( + u'Cache %s failed (%s)' % (cache_fn, file_size)) if player_type == 'js': code = self._download_webpage( @@ -573,6 +580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for l in lang_list: lang = l[1] + if lang in sub_lang_list: + continue params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65b492fb3..8095400d0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -24,6 +24,7 @@ import socket import struct import subprocess import sys +import tempfile import traceback import xml.etree.ElementTree import zlib @@ -228,18 +229,42 @@ else: assert type(s) == type(u'') print(s) -# In Python 2.x, json.dump expects a bytestream. -# In Python 3.x, it writes to a character stream -if sys.version_info < (3,0): - def write_json_file(obj, fn): - with open(fn, 'wb') as f: - json.dump(obj, f) -else: - def write_json_file(obj, fn): - with open(fn, 'w', encoding='utf-8') as f: - json.dump(obj, f) -if sys.version_info >= (2,7): +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically """ + + args = { + 'suffix': '.tmp', + 'prefix': os.path.basename(fn) + '.', + 'dir': os.path.dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**args) + + try: + with tf: + json.dump(obj, tf) + os.rename(tf.name, fn) + except: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) @@ -827,8 +852,10 @@ def unified_strdate(date_str): '%b %dnd %Y %I:%M%p', '%b %dth %Y %I:%M%p', '%Y-%m-%d', + '%Y/%m/%d', '%d.%m.%Y', '%d/%m/%Y', + '%d/%m/%y', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%d.%m.%Y %H:%M', @@ -1259,6 +1286,12 @@ def remove_start(s, start): return s +def remove_end(s, end): + if s.endswith(end): + return s[:-len(end)] + return s + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip(u'/').split(u'/')[-1] @@ -1448,6 +1481,34 @@ def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) +def js_to_json(code): + def fix_kv(m): + key = m.group(2) + if key.startswith("'"): + assert key.endswith("'") + assert '"' not in key + key = '"%s"' % key[1:-1] + elif not key.startswith('"'): + key = '"%s"' % key + + value = m.group(4) + if value.startswith("'"): + assert value.endswith("'") + assert '"' not in value + value = '"%s"' % value[1:-1] + + return m.group(1) + key + m.group(3) + value + + res = re.sub(r'''(?x) + ([{,]\s*) + ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) + (:\s*) + ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) + ''', fix_kv, code) + res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + return res + + def qualities(quality_ids): """ Get a numeric quality value out of a list of possible values """ def q(qid): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2ef0d59e3..a05ce2eba 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.08.10' +__version__ = '2014.08.24.5' |