diff options
30 files changed, 431 insertions, 80 deletions
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ffebb4ae5..58cf9c313 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -128,6 +128,18 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], u'35') + def test_add_extra_info(self): + test_dict = { + 'extractor': 'Foo', + } + extra_info = { + 'extractor': 'Bar', + 'playlist': 'funny videos', + } + YDL.add_extra_info(test_dict, extra_info) + self.assertEqual(test_dict['extractor'], 'Foo') + self.assertEqual(test_dict['playlist'], 'funny videos') + if __name__ == '__main__': unittest.main() diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index c596415c4..ba3580ea4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -22,7 +22,7 @@ class TestDailymotionSubtitles(unittest.TestCase): return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict['subtitles'] def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) diff --git a/test/test_download.py b/test/test_download.py index dfb04d010..73379beb1 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -148,6 +148,9 @@ def generator(test_case): # Check for the presence of mandatory fields for key in ('id', 'url', 'title', 'ext'): self.assertTrue(key in info_dict.keys() and info_dict[key]) + # Check for mandatory fields that are automatically set by YoutubeDL + for key in ['webpage_url', 'extractor', 'extractor_key']: + self.assertTrue(info_dict.get(key), u'Missing field: %s' % key) finally: try_rm_tcs_files() diff --git a/test/test_playlists.py b/test/test_playlists.py index d6a8d56df..de1e8d88e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -20,6 +20,7 @@ from youtube_dl.extractor import ( SoundcloudUserIE, LivestreamIE, NHLVideocenterIE, + BambuserChannelIE, ) @@ -85,5 +86,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Highlights') self.assertEqual(len(result['entries']), 12) + def test_bambuser_channel(self): + dl = FakeYDL() + ie = BambuserChannelIE(dl) + result = ie.extract('http://bambuser.com/channel/pixelversity') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'pixelversity') + self.assertTrue(len(result['entries']) >= 66) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7f73ea360..86a6fd043 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -318,6 +318,12 @@ class YoutubeDL(object): % info_dict) return None + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value in extra_info.items(): + info_dict.setdefault(key, value) + def extract_info(self, url, download=True, ie_key=None, extra_info={}): ''' Returns a list with a dictionary for each video we find. @@ -344,17 +350,17 @@ class YoutubeDL(object): break if isinstance(ie_result, list): # Backwards compatibility: old IE result format - for result in ie_result: - result.update(extra_info) ie_result = { '_type': 'compat_list', 'entries': ie_result, } - else: - ie_result.update(extra_info) - if 'extractor' not in ie_result: - ie_result['extractor'] = ie.IE_NAME - return self.process_ie_result(ie_result, download=download) + self.add_extra_info(ie_result, + { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'extractor_key': ie.ie_key(), + }) + return self.process_ie_result(ie_result, download, extra_info) except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break @@ -378,7 +384,7 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system if result_type == 'video': - ie_result.update(extra_info) + self.add_extra_info(ie_result, extra_info) return self.process_video_result(ie_result) elif result_type == 'url': # We have to add extra_info to the results because it may be @@ -388,6 +394,7 @@ class YoutubeDL(object): ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'playlist': + self.add_extra_info(ie_result, extra_info) # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -413,12 +420,10 @@ class YoutubeDL(object): extra = { 'playlist': playlist, 'playlist_index': i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'extractor_key': ie_result['extractor_key'], } - if not 'extractor' in entry: - # We set the extractor, if it's an url it will be set then to - # the new extractor, but if it's already a video we must make - # sure it's present: see issue #877 - entry['extractor'] = ie_result['extractor'] entry_result = self.process_ie_result(entry, download=download, extra_info=extra) @@ -427,10 +432,15 @@ class YoutubeDL(object): return ie_result elif result_type == 'compat_list': def _fixup(r): - r.setdefault('extractor', ie_result['extractor']) + self.add_extra_info(r, + { + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'extractor_key': ie_result['extractor_key'], + }) return r ie_result['entries'] = [ - self.process_ie_result(_fixup(r), download=download) + self.process_ie_result(_fixup(r), download, extra_info) for r in ie_result['entries'] ] return ie_result diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index caaf54456..888a91cce 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .arte import ( ArteTVFutureIE, ) from .auengine import AUEngineIE +from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE @@ -39,6 +40,7 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE +from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fktv import ( @@ -81,6 +83,7 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE +from .mofosex import MofosexIE from .mtv import MTVIE from .muzu import MuzuTVIE from .myspace import MySpaceIE @@ -142,6 +145,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE @@ -150,6 +154,7 @@ from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE +from .xtube import XTubeIE from .yahoo import YahooIE, YahooSearchIE from .youjizz import YouJizzIE from .youku import YoukuIE @@ -158,6 +163,7 @@ from .youtube import ( YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, + YoutubeSearchDateIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE, diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py new file mode 100644 index 000000000..f3b36f473 --- /dev/null +++ b/youtube_dl/extractor/bambuser.py @@ -0,0 +1,80 @@ +import re +import json +import itertools + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, +) + + +class BambuserIE(InfoExtractor): + IE_NAME = u'bambuser' + _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' + _API_KEY = '005f64509e19a868399060af746a00aa' + + _TEST = { + u'url': u'http://bambuser.com/v/4050584', + u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', + u'info_dict': { + u'id': u'4050584', + u'ext': u'flv', + u'title': u'Education engineering days - lightning talks', + u'duration': 3741, + u'uploader': u'pixelversity', + u'uploader_id': u'344706', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = ('http://player-c.api.bambuser.com/getVideo.json?' + '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json)['result'] + + return { + 'id': video_id, + 'title': info['title'], + 'url': info['url'], + 'thumbnail': info.get('preview'), + 'duration': int(info['length']), + 'view_count': int(info['views_total']), + 'uploader': info['username'], + 'uploader_id': info['uid'], + } + + +class BambuserChannelIE(InfoExtractor): + IE_NAME = u'bambuser:channel' + _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)' + # The maximum number we can get with each request + _STEP = 50 + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + urls = [] + last_id = '' + for i in itertools.count(1): + req_url = ('http://bambuser.com/xhr-api/index.php?username={user}' + '&sort=created&access_mode=0%2C1%2C2&limit={count}' + '&method=broadcast&format=json&vid_older_than={last}' + ).format(user=user, count=self._STEP, last=last_id) + req = compat_urllib_request.Request(req_url) + # Without setting this header, we wouldn't get any result + req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) + info_json = self._download_webpage(req, user, + u'Downloading page %d' % i) + results = json.loads(info_json)['result'] + if len(results) == 0: + break + last_id = results[-1]['vid'] + urls.extend(self.url_result(v['page'], 'Bambuser') for v in results) + + return { + '_type': 'playlist', + 'title': user, + 'entries': urls, + } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1392f382a..0d9b87a34 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,7 +23,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'md5': u'8eccab865181d29ec2958f32a6a754f5', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', @@ -122,12 +122,10 @@ class BrightcoveIE(InfoExtractor): best_format = renditions[-1] info.update({ 'url': best_format['defaultURL'], - 'ext': 'mp4', }) elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], - 'ext': 'flv', }) else: raise ExtractorError(u'Unable to extract video url for %s' % info['id']) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2fe1033f0..8f9396d6b 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor): webpage_url = u'http://' + mobj.group('url') webpage = self._download_webpage(webpage_url, None) # Don't know video id yet video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: raise ExtractorError(u'Can\'t extract embed url and video id') playerdata_url = mobj.group(u'embed_url') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cef4dce85..e0ccba533 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,6 +71,9 @@ class InfoExtractor(object): ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + webpage_url: The url to the video webpage, if given to youtube-dl it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) Unless mentioned otherwise, the fields should be Unicode strings. diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py index d43348955..2c9fb5f2e 100644 --- a/youtube_dl/extractor/depositfiles.py +++ b/youtube_dl/extractor/depositfiles.py @@ -25,7 +25,7 @@ class DepositFilesIE(InfoExtractor): url = 'http://depositfiles.com/en/files/' + file_id # Retrieve file webpage with 'Free download' button pressed - free_download_indication = { 'gateway_result' : '1' } + free_download_indication = {'gateway_result' : '1'} request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication)) try: self.report_download_webpage(file_id) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index c74556579..a51d79b08 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor): u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', }, u'note': u'Soundcloud song', + u'skip': u'The site is down too often', }, { u'url': u'http://ex.fm/song/wddt8', @@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor): u'title': u'Safe and Sound', u'uploader': u'Capital Cities', }, + u'skip': u'The site is down too often', }, ] diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py new file mode 100644 index 000000000..1c20e4364 --- /dev/null +++ b/youtube_dl/extractor/extremetube.py @@ -0,0 +1,50 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class ExtremeTubeIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' + _TEST = { + u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', + u'file': u'652431.mp4', + u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', + u'info_dict': { + u"title": u"Music Video 14 british euro brit european cumshots swallow", + u"uploader": u"unknown", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') + uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join(format) + + return { + 'id': video_id, + 'title': video_title, + 'uploader': uploader, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index ab2b59103..9bd06e7c7 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -30,7 +30,7 @@ class HypemIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) track_id = mobj.group(1) - data = { 'ax': 1, 'ts': time.time() } + data = {'ax': 1, 'ts': time.time()} data_encoded = compat_urllib_parse.urlencode(data) complete_url = url + "?" + data_encoded request = compat_urllib_request.Request(complete_url) @@ -68,4 +68,4 @@ class HypemIE(InfoExtractor): 'ext': "mp3", 'title': title, 'artist': artist, - }]
\ No newline at end of file + }] diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 5e05900da..29658a7d6 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -12,7 +12,7 @@ from ..aes import ( ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' _TEST = { u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', u'file': u'1214711.mp4', @@ -43,10 +43,10 @@ class KeezMoviesIE(InfoExtractor): if webpage.find('encrypted=true')!=-1: password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password') video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] format = path.split('/')[4].split('_')[:2] - format = "-".join( format ) + format = "-".join(format) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index d04da98c8..4531fd6ab 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -40,13 +40,9 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - player = get_meta_content('twitter:player', webpage) - if player is None: - raise ExtractorError('Couldn\'t extract event api url') - api_url = player.replace('/player', '') - api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) - info = json.loads(self._download_webpage(api_url, event_name, - u'Downloading event info')) + config_json = self._search_regex(r'window.config = ({.*?});', + webpage, u'window config') + info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) for video_data in info['feed']['data'] if video_data['type'] == u'video'] return self.playlist_result(videos, info['id'], info['full_name']) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 234b9e80f..91480ba87 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - _TESTS = [{ + _TESTS = [ + # Youtube video + { u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", u"file": u"_aUehQsCQtM.mp4", @@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor): u"uploader_id": u"PBS" } }, + # Normal metacafe video + { + u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', + u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad', + u'info_dict': { + u'id': u'11121940', + u'ext': u'mp4', + u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4', + u'uploader': u'ign', + u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + }, + }, + # AnyClip video { u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", u"file": u"an-dVVXnuY7Jh77J.mp4", u"info_dict": { u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", u"uploader": u"anyclip", - u"description": u"md5:38c711dd98f5bb87acf973d573442e67" - } - }] + u"description": u"md5:38c711dd98f5bb87acf973d573442e67", + }, + }, + # age-restricted video + { + u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', + u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09', + u'info_dict': { + u'id': u'5186653', + u'ext': u'mp4', + u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', + u'uploader': u'Dwayne Pipe', + u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b', + u'age_limit': 18, + }, + }, + ] def report_disclaimer(self): @@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor): 'submit': "Continue - I'm over 18", } request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: self.report_age_confirmation() compat_urllib_request.urlopen(request).read() @@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor): # Retrieve video webpage to extract further information req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - req.headers['Cookie'] = 'flashVersion=0;' + + # AnyClip videos require the flashversion cookie so that we get the link + # to the mp4 file + mobj_an = re.match(r'^an-(.*?)$', video_id) + if mobj_an: + req.headers['Cookie'] = 'flashVersion=0;' webpage = self._download_webpage(req, video_id) # Extract URL, uploader and title from webpage @@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor): r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) + if re.search(r'"contentRating":"restricted"', webpage) is not None: + age_limit = 18 + else: + age_limit = 0 + return { '_type': 'video', 'id': video_id, @@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor): 'upload_date': None, 'title': video_title, 'ext': video_ext, + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py new file mode 100644 index 000000000..b9430b09b --- /dev/null +++ b/youtube_dl/extractor/mofosex.py @@ -0,0 +1,49 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class MofosexIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)' + _TEST = { + u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + u'file': u'5018.mp4', + u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a', + u'info_dict': { + u"title": u"Japanese Teen Music Video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title') + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url')) + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join(format) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'title': video_title, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 5e2454f1b..75cf4bb9f 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -47,10 +47,10 @@ class PornHubIE(InfoExtractor): formats = [] for video_url in video_urls: - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] - format = "-".join( format ) + format = "-".join(format) formats.append({ 'url': video_url, 'ext': extension, diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 32df0a7fb..97f9c268a 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -49,10 +49,10 @@ class SpankwireIE(InfoExtractor): formats = [] for video_url in video_urls: - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] format = path.split('/')[4].split('_')[:2] - format = "-".join( format ) + format = "-".join(format) formats.append({ 'url': video_url, 'ext': extension, diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index aea9d9a24..d4b7603c7 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -46,10 +46,10 @@ class Tube8IE(InfoExtractor): if webpage.find('"encrypted":true')!=-1: password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password') video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] format = path.split('/')[4].split('_')[:2] - format = "-".join( format ) + format = "-".join(format) return { 'id': video_id, diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 12c84a985..826804af3 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -8,7 +8,7 @@ from ..utils import ( class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)' + _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { u"url": u"http://www.viddler.com/v/43903784", u'file': u'43903784.mp4', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c7d864a2b..d465bf20b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ @@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') - if not mobj.group('proto'): - url = 'https://' + url - elif mobj.group('pro'): + if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id - elif mobj.group('direct_link'): + else: url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -205,7 +203,7 @@ class VimeoIE(InfoExtractor): # Vimeo specific: extract video codec and quality information # First consider quality, then codecs, then take everything codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] - files = { 'hd': [], 'sd': [], 'other': []} + files = {'hd': [], 'sd': [], 'other': []} config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: for quality in config_files.get(codec_name, []): @@ -234,7 +232,7 @@ class VimeoIE(InfoExtractor): if len(formats) == 0: raise ExtractorError(u'No known codec found') - return [{ + return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, @@ -243,7 +241,8 @@ class VimeoIE(InfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, 'formats': formats, - }] + 'webpage_url': url, + } class VimeoChannelIE(InfoExtractor): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py new file mode 100644 index 000000000..90d8a6d07 --- /dev/null +++ b/youtube_dl/extractor/vk.py @@ -0,0 +1,45 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, + unescapeHTML, +) + + +class VKIE(InfoExtractor): + IE_NAME = u'vk.com' + _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + + _TEST = { + u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + u'md5': u'0deae91935c54e00003c2a00646315f0', + u'info_dict': { + u'id': u'162222515', + u'ext': u'flv', + u'title': u'ProtivoGunz - Хуёвая песня', + u'uploader': u'Noize MC', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id + info_page = self._download_webpage(info_url, video_id) + m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) + if m_yt is not None: + self.to_screen(u'Youtube video detected') + return self.url_result(m_yt.group(1), 'Youtube') + vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') + vars = json.loads(vars_json) + + return { + 'id': compat_str(vars['vid']), + 'url': vars['url240'], + 'title': unescapeHTML(vars['md_title']), + 'thumbnail': vars['jpg'], + 'uploader': vars['md_author'], + } diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py new file mode 100644 index 000000000..03ad88bed --- /dev/null +++ b/youtube_dl/extractor/xtube.py @@ -0,0 +1,55 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class XTubeIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' + _TEST = { + u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_', + u'file': u'kVTUy_G222_.mp4', + u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab', + u'info_dict': { + u"title": u"strange erotica", + u"description": u"surreal gay themed erotica...almost an ET kind of thing", + u"uploader": u"greenshowers", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title') + video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False) + video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None) + video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] + format = path.split('/')[5].split('_')[:2] + format[0] += 'p' + format[1] += 'k' + format = "-".join(format) + + return { + 'id': video_id, + 'title': video_title, + 'uploader': video_uploader, + 'description': video_description, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 464b498f5..34e6afb20 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -132,7 +132,7 @@ class YahooSearchIE(SearchInfoExtractor): mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) e = self.url_result('http://' + mobj.group('url'), 'Yahoo') res['entries'].append(e) - if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): + if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)): break return res diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 9d88c17f5..a8fd40c83 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -18,7 +18,7 @@ class YoukuIE(InfoExtractor): u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", u"file": u"XNDgyMDQ2NTQw_part00.flv", u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b", - u"params": { u"test": False }, + u"params": {u"test": False}, u"info_dict": { u"title": u"youtube-dl test video \"'/\\ä↭𝕐" } @@ -37,8 +37,8 @@ class YoukuIE(InfoExtractor): source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890") seed = float(seed) for i in range(len(source)): - seed = (seed * 211 + 30031 ) % 65536 - index = math.floor(seed / 65536 * len(source) ) + seed = (seed * 211 + 30031) % 65536 + index = math.floor(seed / 65536 * len(source)) mixed.append(source[int(index)]) source.remove(source[int(index)]) #return ''.join(mixed) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e46a9b4d6..bd0f2cae0 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -81,14 +81,14 @@ class YouPornIE(InfoExtractor): # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 # A path looks like this: # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 - video_url = unescapeHTML( link ) - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] + video_url = unescapeHTML(link) + path = compat_urllib_parse_urlparse(video_url).path + extension = os.path.splitext(path)[1][1:] format = path.split('/')[4].split('_')[:2] # size = format[0] # bitrate = format[1] - format = "-".join( format ) + format = "-".join(format) # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f3a2a32b4..74a381fe2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -340,18 +340,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): } }, { - u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U", - u"file": u"1ltcDfZMA3U.mp4", - u"note": u"Test VEVO video (#897)", - u"info_dict": { - u"upload_date": u"20070518", - u"title": u"Maps - It Will Find You", - u"description": u"Music video by Maps performing It Will Find You.", - u"uploader": u"MuteUSA", - u"uploader_id": u"MuteUSA" - } - }, - { u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY", u"file": u"UxxajLWwzqY.mp4", u"note": u"Test generic use_cipher_signature video (#897)", @@ -1111,7 +1099,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat'), - 'name': l[0], + 'name': l[0].encode('utf-8'), }) url = u'http://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url @@ -1497,7 +1485,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations + 'annotations': video_annotations, + 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, }) return results @@ -1743,6 +1732,10 @@ class YoutubeSearchIE(SearchInfoExtractor): videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] return self.playlist_result(videos, query) +class YoutubeSearchDateIE(YoutubeSearchIE): + _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' + _SEARCH_KEY = 'ytsearchdate' + IE_DESC = u'YouTube.com searches, newest videos first' class YoutubeShowIE(InfoExtractor): IE_DESC = u'YouTube.com (multi-season) shows' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e8eade7ad..cc0f9cb4e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.30' +__version__ = '2013.11.03' |