diff options
Diffstat (limited to 'youtube_dl/extractor')
31 files changed, 601 insertions, 238 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index db69af361..0d933986f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE +from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE @@ -94,6 +95,7 @@ from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE @@ -109,6 +111,7 @@ from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import SouthParkStudiosIE +from .spankwire import SpankwireIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE @@ -121,6 +124,7 @@ from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 82a785a19..465df8cf0 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -17,8 +17,8 @@ class AddAnimeIE(InfoExtractor): IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - u'file': u'24MR3YO5SAS9.flv', - u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'file': u'24MR3YO5SAS9.mp4', + u'md5': u'72954ea10bc979ab5e2eb288b21425a0', u'info_dict': { u"description": u"One Piece 606", u"title": u"One Piece 606" @@ -60,8 +60,10 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + video_url = self._search_regex(r"var (?:hq|normal)_video_file = '(.*?)';", webpage, u'video file URL') + + video_extension = video_url[-3:] # mp4 or flv ? video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) @@ -69,7 +71,7 @@ class AddAnimeIE(InfoExtractor): '_type': 'video', 'id': video_id, 'url': video_url, - 'ext': 'flv', + 'ext': video_extension, 'title': video_title, 'description': video_description } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 5ee8a67b1..d39b48951 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -174,12 +174,27 @@ class ArteTVPlus7IE(InfoExtractor): # Some formats use the m3u8 protocol formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) # We order the formats by quality - formats = sorted(formats, key=lambda f: int(f.get('height',-1))) + formats = list(formats) # in python3 filter returns an iterator + if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: + sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) + else: + sort_key = lambda f: int(f.get('height',-1)) + formats = sorted(formats, key=sort_key) # Prefer videos without subtitles in the same language formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) # Pick the best quality def _format(format_info): + quality = format_info['quality'] + m_quality = re.match(r'\w*? - (\d*)p', quality) + if m_quality is not None: + quality = m_quality.group(1) + if format_info.get('versionCode') is not None: + format_id = u'%s-%s' % (quality, format_info['versionCode']) + else: + format_id = quality info = { + 'format_id': format_id, + 'format_note': format_info.get('versionLibelle'), 'width': format_info.get('width'), 'height': format_info.get('height'), } @@ -192,8 +207,6 @@ class ArteTVPlus7IE(InfoExtractor): info['ext'] = determine_ext(info['url']) return info info_dict['formats'] = [_format(f) for f in formats] - # TODO: Remove when #980 has been merged - info_dict.update(info_dict['formats'][-1]) return info_dict @@ -207,7 +220,7 @@ class ArteTVCreativeIE(ArteTVPlus7IE): u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', u'file': u'050489-002.mp4', u'info_dict': { - u'title': u'Agentur Amateur #2 - Corporate Design', + u'title': u'Agentur Amateur / Agence Amateur #2 : Corporate Design', }, } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 6925b96c2..2fe1033f0 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -55,30 +55,30 @@ class CinemassacreIE(InfoExtractor): video_description = None playerdata = self._download_webpage(playerdata_url, video_id) - base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'', - playerdata, u'base_url') - base_url += '/Cinemassacre/' - # Important: The file names in playerdata are not used by the player and even wrong for some videos - sd_file = 'Cinemassacre-%s_high.mp4' % video_id - hd_file = 'Cinemassacre-%s.mp4' % video_id - video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url') + + sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file') + hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file') + video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False) formats = [ { - 'url': base_url + sd_file, + 'url': url, + 'play_path': 'mp4:' + sd_file, 'ext': 'flv', 'format': 'sd', 'format_id': 'sd', }, { - 'url': base_url + hd_file, + 'url': url, + 'play_path': 'mp4:' + hd_file, 'ext': 'flv', 'format': 'hd', 'format_id': 'hd', }, ] - info = { + return { 'id': video_id, 'title': video_title, 'formats': formats, @@ -86,6 +86,3 @@ class CinemassacreIE(InfoExtractor): 'upload_date': video_date, 'thumbnail': video_thumbnail, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d4af3b5eb..aaa5c24c8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,6 +14,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + RegexNotFoundError, unescapeHTML, ) @@ -61,9 +62,12 @@ class InfoExtractor(object): * ext Will be calculated from url if missing * format A human-readable description of the format ("mp4 container with h264/opus"). - Calculated from width and height if missing. + Calculated from the format_id, width, height + and format_note fields if missing. * format_id A short description of the format ("mp4_h264_opus" or "19") + * format_note Additional info about the format + ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known @@ -228,7 +232,7 @@ class InfoExtractor(object): Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. In case of failure return a default value or raise a WARNING or a - ExtractorError, depending on fatal, specifying the field name. + RegexNotFoundError, depending on fatal, specifying the field name. """ if isinstance(pattern, (str, compat_str, compiled_regex_type)): mobj = re.search(pattern, string, flags) @@ -248,7 +252,7 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s' % _name) + raise RegexNotFoundError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on http://yt-dl.org/bug' % _name) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7d8353946..4c0488245 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,6 +28,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' + + _FORMATS = [ + (u'stream_h264_ld_url', u'ld'), + (u'stream_h264_url', u'standard'), + (u'stream_h264_hq_url', u'hq'), + (u'stream_h264_hd_url', u'hd'), + (u'stream_h264_hd1080_url', u'hd180'), + ] + _TESTS = [ { u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', @@ -60,7 +69,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(1).split('_')[0].split('?')[0] - video_extension = 'mp4' url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information @@ -99,18 +107,24 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] raise ExtractorError(msg, expected=True) - # TODO: support choosing qualities - - for key in ['stream_h264_hd1080_url','stream_h264_hd_url', - 'stream_h264_hq_url','stream_h264_url', - 'stream_h264_ld_url']: - if info.get(key):#key in info and info[key]: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: + formats = [] + for (key, format_id) in self._FORMATS: + video_url = info.get(key) + if video_url is not None: + m_size = re.search(r'H264-(\d+)x(\d+)', video_url) + if m_size is not None: + width, height = m_size.group(1), m_size.group(2) + else: + width, height = None, None + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': format_id, + 'width': width, + 'height': height, + }) + if not formats: raise ExtractorError(u'Unable to extract video URL') - video_url = info[max_quality] # subtitles video_subtitles = self.extract_subtitles(video_id) @@ -120,11 +134,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'formats': formats, 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), - 'ext': video_extension, 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'] }] diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index cced06811..2cfbcd363 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -101,7 +101,7 @@ class EightTracksIE(InfoExtractor): first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) next_url = first_url res = [] - for i in itertools.count(): + for i in range(track_count): api_json = self._download_webpage(next_url, playlist_id, note=u'Downloading song information %s/%s' % (str(i+1), track_count), errnote=u'Failed to download song information') @@ -116,7 +116,5 @@ class EightTracksIE(InfoExtractor): 'ext': 'm4a', } res.append(info) - if api_data['set']['at_last_track']: - break next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) return res diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 3443f19c5..c74556579 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -11,14 +11,14 @@ class ExfmIE(InfoExtractor): _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' _TESTS = [ { - u'url': u'http://ex.fm/song/1bgtzg', - u'file': u'95223130.mp3', - u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', + u'url': u'http://ex.fm/song/eh359', + u'file': u'44216187.mp3', + u'md5': u'e45513df5631e6d760970b14cc0c11e7', u'info_dict': { - u"title": u"We Can't Stop - Miley Cyrus", - u"uploader": u"Miley Cyrus", - u'upload_date': u'20130603', - u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', + u"title": u"Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive", + u"uploader": u"deadjournalist", + u'upload_date': u'20120424', + u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', }, u'note': u'Soundcloud song', }, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 9d1bc0751..f8bdfc2d3 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -19,7 +19,8 @@ class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' - _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' + _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' + _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' IE_NAME = u'facebook' _TEST = { @@ -36,50 +37,56 @@ class FacebookIE(InfoExtractor): """Report attempt to log in.""" self.to_screen(u'Logging in') - def _real_initialize(self): - if self._downloader is None: - return - - useremail = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - useremail = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - useremail = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) - return - + def _login(self): + (useremail, password) = self._get_login_info() if useremail is None: return - # Log in + login_page_req = compat_urllib_request.Request(self._LOGIN_URL) + login_page_req.add_header('Cookie', 'locale=en_US') + self.report_login() + login_page = self._download_webpage(login_page_req, None, note=False, + errnote=u'Unable to download login page') + lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd') + lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd') + login_form = { 'email': useremail, 'pass': password, - 'login': 'Log+In' + 'lsd': lsd, + 'lgnrnd': lgnrnd, + 'next': 'http://facebook.com/home.php', + 'default_persistent': '0', + 'legacy_return': '1', + 'timezone': '-60', + 'trynum': '1', } request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: - self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return + + check_form = { + 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'), + 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'), + 'name_action_selected': 'dont_save', + 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'), + } + check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form)) + check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_response = compat_urllib_request.urlopen(check_req).read() + if re.search(r'id="checkpointSubmitButton"', check_response) is not None: + self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return + def _real_initialize(self): + self._login() + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -93,7 +100,13 @@ class FacebookIE(InfoExtractor): AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) if not m: - raise ExtractorError(u'Cannot parse data') + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) + if m_msg is not None: + raise ExtractorError( + u'The video is not available, Facebook said: "%s"' % m_msg.group(1), + expected=True) + else: + raise ExtractorError(u'Cannot parse data') data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index deaa4ed2d..89ed08db4 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -5,8 +5,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( determine_ext, - clean_html, - get_element_by_attribute, ) @@ -47,12 +45,12 @@ class FazIE(InfoExtractor): 'format_id': code.lower(), }) - descr_html = get_element_by_attribute('class', 'Content Copy', webpage) + descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') info = { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, - 'description': clean_html(descr_html), + 'description': descr, 'thumbnail': config.find('STILL/STILL_BIG').text, } # TODO: Remove when #980 has been merged diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 69e0a7bd2..2c8fcf5ae 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -25,7 +25,7 @@ class GenericIE(InfoExtractor): { u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', u'file': u'13601338388002.mp4', - u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', + u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd', u'info_dict': { u"uploader": u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" @@ -41,7 +41,17 @@ class GenericIE(InfoExtractor): u"uploader_id": u"skillsmatter", u"uploader": u"Skills Matter", } - } + }, + # bandcamp page with custom domain + { + u'url': u'http://bronyrock.com/track/the-pony-mash', + u'file': u'3235767654.mp3', + u'info_dict': { + u'title': u'The Pony Mash', + u'uploader': u'M_Pallante', + }, + u'skip': u'There is a limit of 200 free downloads / month for the test song', + }, ] def report_download_webpage(self, video_id): @@ -155,6 +165,12 @@ class GenericIE(InfoExtractor): surl = unescapeHTML(mobj.group(1)) return self.url_result(surl, 'Youtube') + # Look for Bandcamp pages with custom domain + mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) + if mobj is not None: + burl = unescapeHTML(mobj.group(1)) + return self.url_result(burl, 'Bandcamp') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index ab12d7e93..2570746b2 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -41,9 +41,9 @@ class GooglePlusIE(InfoExtractor): # Extract update date upload_date = self._html_search_regex( - r'''(?x)<a.+?class="o-T-s\s[^"]+"\s+style="display:\s*none"\s*> + r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*> ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''', - webpage, u'upload date', fatal=False) + webpage, u'upload date', fatal=False, flags=re.VERBOSE) if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 5986459d6..be8e05f53 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -19,7 +19,7 @@ class InternetVideoArchiveIE(InfoExtractor): u'info_dict': { u'title': u'SKYFALL', u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - u'duration': 156, + u'duration': 153, }, } @@ -74,7 +74,7 @@ class InternetVideoArchiveIE(InfoExtractor): }) formats = sorted(formats, key=lambda f: f['bitrate']) - info = { + return { 'id': video_id, 'title': item.find('title').text, 'formats': formats, @@ -82,6 +82,3 @@ class InternetVideoArchiveIE(InfoExtractor): 'description': item.find('description').text, 'duration': int(attr['duration']), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py new file mode 100644 index 000000000..5e05900da --- /dev/null +++ b/youtube_dl/extractor/keezmovies.py @@ -0,0 +1,61 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) +from ..aes import ( + aes_decrypt_text +) + +class KeezMoviesIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))' + _TEST = { + u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', + u'file': u'1214711.mp4', + u'md5': u'6e297b7e789329923fcf83abb67c9289', + u'info_dict': { + u"title": u"Petite Asian Lady Mai Playing In Bathtub", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + # embedded video + mobj = re.search(r'href="([^"]+)"></iframe>', webpage) + if mobj: + embedded_url = mobj.group(1) + return self.url_result(embedded_url) + + video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + if webpage.find('encrypted=true')!=-1: + password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password') + video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'title': video_title, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e537648ff..234b9e80f 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -23,7 +23,7 @@ class MetacafeIE(InfoExtractor): _TESTS = [{ u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - u"file": u"_aUehQsCQtM.flv", + u"file": u"_aUehQsCQtM.mp4", u"info_dict": { u"upload_date": u"20090102", u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!", diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index e8d43dd13..224f56ac8 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -90,8 +90,8 @@ class NHLVideocenterIE(NHLBaseInfoExtractor): r'{statusIndex:0,index:0,.*?id:(.*?),'], webpage, u'category id') playlist_title = self._html_search_regex( - r'\?catid=%s">(.*?)</a>' % cat_id, - webpage, u'playlist title', flags=re.DOTALL) + r'tab0"[^>]*?>(.*?)</td>', + webpage, u'playlist title', flags=re.DOTALL).lower().capitalize() data = compat_urllib_parse.urlencode({ 'cid': cat_id, diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index ab52ad401..241cc160b 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -20,7 +20,10 @@ class NowVideoIE(InfoExtractor): video_id = mobj.group('id') webpage_url = 'http://www.nowvideo.ch/video/' + video_id + embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id webpage = self._download_webpage(webpage_url, video_id) + embed_page = self._download_webpage(embed_url, video_id, + u'Downloading embed page') self.report_extraction(video_id) @@ -28,7 +31,7 @@ class NowVideoIE(InfoExtractor): webpage, u'video title') video_key = self._search_regex(r'var fkzd="(.*)";', - webpage, u'video key') + embed_page, u'video key') api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) api_response = self._download_webpage(api_call, video_id, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py new file mode 100644 index 000000000..5e2454f1b --- /dev/null +++ b/youtube_dl/extractor/pornhub.py @@ -0,0 +1,69 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class PornHubIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _TEST = { + u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015', + u'file': u'648719015.mp4', + u'md5': u'882f488fa1f0026f023f33576004a2ed', + u'info_dict': { + u"uploader": u"BABES-COM", + u"title": u"Seductive Indian beauty strips down and fingers her pink pussy", + u"age_limit": 18 + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, u'title') + video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False) + if thumbnail: + thumbnail = compat_urllib_parse.unquote(thumbnail) + + video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + if webpage.find('"encrypted":true') != -1: + password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password').replace('+', ' ') + video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + + formats = [] + for video_url in video_urls: + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join( format ) + formats.append({ + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + }) + formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'formats': formats, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5d770ec28..35dc5a9ff 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -16,7 +16,8 @@ class PornotubeIE(InfoExtractor): u'md5': u'374dd6dcedd24234453b295209aa69b6', u'info_dict': { u"upload_date": u"20090708", - u"title": u"Marilyn-Monroe-Bathing" + u"title": u"Marilyn-Monroe-Bathing", + u"age_limit": 18 } } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 365aade56..994778e16 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -10,7 +10,8 @@ class RedTubeIE(InfoExtractor): u'file': u'66418.mp4', u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', u'info_dict': { - u"title": u"Sucked on a toilet" + u"title": u"Sucked on a toilet", + u"age_limit": 18, } } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index d1b08c9bc..9ac7c3be8 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -63,13 +63,12 @@ class RTLnowIE(InfoExtractor): }, }, { - u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', - u'file': u'127367.flv', + u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1', + u'file': u'129679.flv', u'info_dict': { - u'upload_date': u'20130926', - u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', - u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', - u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', + u'upload_date': u'20131016', + u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...', + u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig', }, u'params': { u'skip_download': True, diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py new file mode 100644 index 000000000..32df0a7fb --- /dev/null +++ b/youtube_dl/extractor/spankwire.py @@ -0,0 +1,74 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class SpankwireIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)' + _TEST = { + u'url': u'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + u'file': u'103545.mp4', + u'md5': u'1b3f55e345500552dbc252a3e9c1af43', + u'info_dict': { + u"uploader": u"oreusz", + u"title": u"Buckcherry`s X Rated Music Video Crazy Bitch", + u"description": u"Crazy Bitch X rated music video.", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title') + video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False) + description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False) + if len(description) == 0: + description = None + + video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) + if webpage.find('flashvars\.encrypted = "true"') != -1: + password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, u'password').replace('+', ' ') + video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + + formats = [] + for video_url in video_urls: + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + formats.append({ + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + }) + formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': description, + 'formats': formats, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py new file mode 100644 index 000000000..aea9d9a24 --- /dev/null +++ b/youtube_dl/extractor/tube8.py @@ -0,0 +1,65 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class Tube8IE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)' + _TEST = { + u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/', + u'file': u'229795.mp4', + u'md5': u'e9e0b0c86734e5e3766e653509475db0', + u'info_dict': { + u"description": u"hot teen Kasia grinding", + u"uploader": u"unknown", + u"title": u"Kasia music video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'videotitle ="([^"]+)', webpage, u'title') + video_description = self._html_search_regex(r'>Description:</strong>(.+?)<', webpage, u'description', fatal=False) + video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False) + if thumbnail: + thumbnail = thumbnail.replace('\\/', '/') + + video_url = self._html_search_regex(r'"video_url":"([^"]+)', webpage, u'video_url') + if webpage.find('"encrypted":true')!=-1: + password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password') + video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': video_description, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index d89f84094..265dd5b91 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -16,7 +16,7 @@ class VideoDetectiveIE(InfoExtractor): u'info_dict': { u'title': u'KICK-ASS 2', u'description': u'md5:65ba37ad619165afac7d432eaded6013', - u'duration': 138, + u'duration': 135, }, } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2de56ac81..ef90fecc0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import json import re import itertools @@ -10,6 +11,7 @@ from ..utils import ( clean_html, get_element_by_attribute, ExtractorError, + RegexNotFoundError, std_headers, unsmuggle_url, ) @@ -25,7 +27,7 @@ class VimeoIE(InfoExtractor): { u'url': u'http://vimeo.com/56015672', u'file': u'56015672.mp4', - u'md5': u'8879b6cc097e987f02484baf890129e5', + u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', u'info_dict': { u"upload_date": u"20121220", u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", @@ -54,7 +56,22 @@ class VimeoIE(InfoExtractor): u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', u'uploader': u'The BLN & Business of Software', }, - } + }, + { + u'url': u'http://vimeo.com/68375962', + u'file': u'68375962.mp4', + u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7', + u'note': u'Video protected with password', + u'info_dict': { + u'title': u'youtube-dl password protected test video', + u'upload_date': u'20130614', + u'uploader_id': u'user18948128', + u'uploader': u'Jaime Marquínez Ferrándiz', + }, + u'params': { + u'videopassword': u'youtube-dl', + }, + }, ] def _login(self): @@ -129,18 +146,26 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], - webpage, u'info section', flags=re.DOTALL) - config = json.loads(config) - except: + try: + config_url = self._html_search_regex( + r' data-config-url="(.+?)"', webpage, u'config URL') + config_json = self._download_webpage(config_url, video_id) + config = json.loads(config_json) + except RegexNotFoundError: + # For pro videos or player.vimeo.com urls + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) + config = json.loads(config) + except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') - if re.search('If so please provide the correct password.', webpage): + if re.search('<form[^>]+?id="pw_form"', webpage) is not None: self._verify_video_password(url, video_id, webpage) return self._real_extract(url) else: - raise ExtractorError(u'Unable to extract info section') + raise ExtractorError(u'Unable to extract info section', + cause=e) # Extract title video_title = config["video"]["title"] @@ -179,46 +204,45 @@ class VimeoIE(InfoExtractor): # Vimeo specific: extract video codec and quality information # First consider quality, then codecs, then take everything - # TODO bind to format param - codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] + codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] files = { 'hd': [], 'sd': [], 'other': []} config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: - if codec_name in config_files: - if 'hd' in config_files[codec_name]: - files['hd'].append((codec_name, codec_extension, 'hd')) - elif 'sd' in config_files[codec_name]: - files['sd'].append((codec_name, codec_extension, 'sd')) + for quality in config_files.get(codec_name, []): + format_id = '-'.join((codec_name, quality)).lower() + key = quality if quality in files else 'other' + video_url = None + if isinstance(config_files[codec_name], dict): + file_info = config_files[codec_name][quality] + video_url = file_info.get('url') else: - files['other'].append((codec_name, codec_extension, config_files[codec_name][0])) - - for quality in ('hd', 'sd', 'other'): - if len(files[quality]) > 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - raise ExtractorError(u'No known codec found') + file_info = {} + if video_url is None: + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, quality, codec_name.upper()) - video_url = None - if isinstance(config_files[video_codec], dict): - video_url = config_files[video_codec][video_quality].get("url") - if video_url is None: - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) + files[key].append({ + 'ext': codec_extension, + 'url': video_url, + 'format_id': format_id, + 'width': file_info.get('width'), + 'height': file_info.get('height'), + }) + formats = [] + for key in ('other', 'sd', 'hd'): + formats += files[key] + if len(formats) == 0: + raise ExtractorError(u'No known codec found') return [{ 'id': video_id, - 'url': video_url, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': video_upload_date, 'title': video_title, - 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, + 'formats': formats, }] diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 361619694..7444d3393 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -19,7 +19,8 @@ class XHamsterIE(InfoExtractor): u'info_dict': { u"upload_date": u"20121014", u"uploader_id": u"Ruseful2011", - u"title": u"FemaleAgent Shy beauty takes the bait" + u"title": u"FemaleAgent Shy beauty takes the bait", + u"age_limit": 18, } }, { @@ -27,28 +28,33 @@ class XHamsterIE(InfoExtractor): u'file': u'2221348.flv', u'md5': u'e767b9475de189320f691f49c679c4c7', u'info_dict': { - u"upload_date": u"20130914", - u"uploader_id": u"jojo747400", - u"title": u"Britney Spears Sexy Booty" + u"upload_date": u"20130914", + u"uploader_id": u"jojo747400", + u"title": u"Britney Spears Sexy Booty", + u"age_limit": 18, } }] def _real_extract(self,url): + def extract_video_url(webpage): + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + if len(mobj.group('server')) == 0: + return compat_urllib_parse.unquote(mobj.group('file')) + else: + return mobj.group('server')+'/key='+mobj.group('file') + + def is_hd(webpage): + return webpage.find('<div class=\'icon iconHD\'>') != -1 + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') seo = mobj.group('seo') - mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo) + mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - if len(mobj.group('server')) == 0: - video_url = compat_urllib_parse.unquote(mobj.group('file')) - else: - video_url = mobj.group('server')+'/key='+mobj.group('file') - video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, u'title') @@ -72,13 +78,34 @@ class XHamsterIE(InfoExtractor): video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage, u'thumbnail', fatal=False) - return [{ - 'id': video_id, - 'url': video_url, - 'ext': determine_ext(video_url), - 'title': video_title, + age_limit = self._rta_search(webpage) + + video_url = extract_video_url(webpage) + hd = is_hd(webpage) + formats = [{ + 'url': video_url, + 'ext': determine_ext(video_url), + 'format': 'hd' if hd else 'sd', + 'format_id': 'hd' if hd else 'sd', + }] + if not hd: + webpage = self._download_webpage(mrss_url+'?hd', video_id) + if is_hd(webpage): + video_url = extract_video_url(webpage) + formats.append({ + 'url': video_url, + 'ext': determine_ext(video_url), + 'format': 'hd', + 'format_id': 'hd', + }) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'thumbnail': video_thumbnail - }] + 'thumbnail': video_thumbnail, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 40d848900..8a0eb1afd 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -18,7 +18,8 @@ class XNXXIE(InfoExtractor): u'file': u'1135332.flv', u'md5': u'0831677e2b4761795f68d417e0b7b445', u'info_dict': { - u"title": u"lida \u00bb Naked Funny Actress (5)" + u"title": u"lida \u00bb Naked Funny Actress (5)", + u"age_limit": 18, } } @@ -50,4 +51,5 @@ class XNXXIE(InfoExtractor): 'ext': 'flv', 'thumbnail': video_thumbnail, 'description': None, + 'age_limit': 18, }] diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index c3b9736d7..90138d7e5 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -13,7 +13,8 @@ class XVideosIE(InfoExtractor): u'file': u'939581.flv', u'md5': u'1d0c835822f0a71a7bf011855db929d0', u'info_dict': { - u"title": u"Funny Porns By >>>>S<<<<<< -1" + u"title": u"Funny Porns By >>>>S<<<<<< -1", + u"age_limit": 18, } } @@ -46,6 +47,7 @@ class XVideosIE(InfoExtractor): 'ext': 'flv', 'thumbnail': video_thumbnail, 'description': None, + 'age_limit': 18, } return [info] diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 1265639e8..1fcc518ac 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -13,7 +13,8 @@ class YouJizzIE(InfoExtractor): u'file': u'2189178.flv', u'md5': u'07e15fa469ba384c7693fd246905547c', u'info_dict': { - u"title": u"Zeichentrick 1" + u"title": u"Zeichentrick 1", + u"age_limit": 18, } } @@ -25,6 +26,8 @@ class YouJizzIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) + age_limit = self._rta_search(webpage) + # Get the video title video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>', webpage, u'title').strip() @@ -60,6 +63,7 @@ class YouJizzIE(InfoExtractor): 'title': video_title, 'ext': 'flv', 'format': 'flv', - 'player_url': embed_page_url} + 'player_url': embed_page_url, + 'age_limit': age_limit} return [info] diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b1f93dd1b..e46a9b4d6 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -17,7 +17,7 @@ from ..aes import ( ) class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', @@ -26,27 +26,15 @@ class YouPornIE(InfoExtractor): u"upload_date": u"20101221", u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", u"uploader": u"Ask Dan And Jennifer", - u"title": u"Sex Ed: Is It Safe To Masturbate Daily?" + u"title": u"Sex Ed: Is It Safe To Masturbate Daily?", + u"age_limit": 18, } } - def _print_formats(self, formats): - """Print all available formats""" - print(u'Available formats:') - print(u'ext\t\tformat') - print(u'---------------------------------') - for format in formats: - print(u'%s\t\t%s' % (format['ext'], format['format'])) - - def _specific(self, req_format, formats): - for x in formats: - if x["format"] == req_format: - return x - return None - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') @@ -70,27 +58,22 @@ class YouPornIE(InfoExtractor): except KeyError: raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) - # Get all of the formats available + # Get all of the links from the page DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, u'download list').strip() - - # Get all of the links from the page - LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' + LINK_RE = r'<a href="([^"]+)">' links = re.findall(LINK_RE, download_list_html) - - # Get link of hd video if available - mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage) - if mobj != None: - encrypted_video_url = mobj.group(u'encrypted_video_url') - video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') - links = [video_url] + links + + # Get all encrypted links + encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage) + for encrypted_link in encrypted_links: + link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') + links.append(link) if not links: raise ExtractorError(u'ERROR: no known formats available for video') - self.to_screen(u'Links found: %d' % len(links)) - formats = [] for link in links: @@ -102,39 +85,32 @@ class YouPornIE(InfoExtractor): path = compat_urllib_parse_urlparse( video_url ).path extension = os.path.splitext( path )[1][1:] format = path.split('/')[4].split('_')[:2] + # size = format[0] # bitrate = format[1] format = "-".join( format ) # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ - 'id': video_id, 'url': video_url, - 'uploader': video_uploader, - 'upload_date': upload_date, - 'title': video_title, 'ext': extension, 'format': format, - 'thumbnail': thumbnail, - 'description': video_description, - 'age_limit': age_limit, + 'format_id': format, }) - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - req_format = self._downloader.params.get('format', 'best') - self.to_screen(u'Format: %s' % req_format) - - if req_format is None or req_format == 'best': - return [formats[0]] - elif req_format == 'worst': - return [formats[-1]] - elif req_format in ('-1', 'all'): - return formats - else: - format = self._specific( req_format, formats ) - if format is None: - raise ExtractorError(u'Requested format not available') - return [format] + # Sort and remove doubles + formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + for i in range(len(formats)-1,0,-1): + if formats[i]['format_id'] == formats[i-1]['format_id']: + del formats[i] + + return { + 'id': video_id, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': video_description, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb7c42830..d05d0a8c1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -236,11 +236,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '136': 'mp4', '137': 'mp4', '138': 'mp4', - '139': 'mp4', - '140': 'mp4', - '141': 'mp4', '160': 'mp4', + # Dash mp4 audio + '139': 'm4a', + '140': 'm4a', + '141': 'm4a', + # Dash webm '171': 'webm', '172': 'webm', @@ -346,7 +348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): }, { u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U", - u"file": u"1ltcDfZMA3U.flv", + u"file": u"1ltcDfZMA3U.mp4", u"note": u"Test VEVO video (#897)", u"info_dict": { u"upload_date": u"20070518", @@ -1150,7 +1152,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): list_page = self._download_webpage(list_url, video_id) caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) original_lang_node = caption_list.find('track') - if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' : + if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] @@ -1403,32 +1405,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: raise ValueError(u'No stream_map present') # caught below - m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map']) + re_signature = re.compile(r'[&,]s=') + m_s = re_signature.search(args['url_encoded_fmt_stream_map']) if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] - m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u'')) + m_s = re_signature.search(args.get('adaptive_fmts', u'')) if m_s is not None: - if 'url_encoded_fmt_stream_map' in video_info: - video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts'] - else: - video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']] - elif 'adaptive_fmts' in video_info: - if 'url_encoded_fmt_stream_map' in video_info: - video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0] + if 'adaptive_fmts' in video_info: + video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts'] else: - video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts'] + video_info['adaptive_fmts'] = [args['adaptive_fmts']] except ValueError: pass if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() video_url_list = [(None, video_info['conn'][0])] - elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: - if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]: + elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: + encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] + if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) url_map = {} - for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): + for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: url = url_data['url'][0] @@ -1481,13 +1480,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') results = [] - for format_param, video_real_url in video_url_list: + for itag, video_real_url in video_url_list: # Extension - video_extension = self._video_extensions.get(format_param, 'flv') + video_extension = self._video_extensions.get(itag, 'flv') - video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???'), - ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '') + video_format = '{0} - {1}{2}'.format(itag if itag else video_extension, + self._video_dimensions.get(itag, '???'), + ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '') results.append({ 'id': video_id, @@ -1498,6 +1497,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'title': video_title, 'ext': video_extension, 'format': video_format, + 'format_id': itag, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': player_url, |