diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 11 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/googleplus.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/jukebox.py | 56 | ||||
-rw-r--r-- | youtube_dl/extractor/tudou.py | 32 | ||||
-rw-r--r-- | youtube_dl/extractor/worldstarhiphop.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 9 |
7 files changed, 111 insertions, 11 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9878ad942..0ea990860 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -4,8 +4,8 @@ from .arte import ArteTvIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE -from .comedycentral import ComedyCentralIE from .collegehumor import CollegeHumorIE +from .comedycentral import ComedyCentralIE from .dailymotion import DailymotionIE from .depositfiles import DepositFilesIE from .eighttracks import EightTracksIE @@ -21,6 +21,7 @@ from .howcast import HowcastIE from .hypem import HypemIE from .ina import InaIE from .infoq import InfoQIE +from .jukebox import JukeboxIE from .justintv import JustinTVIE from .keek import KeekIE from .liveleak import LiveLeakIE @@ -30,7 +31,6 @@ from .mtv import MTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE -from .statigram import StatigramIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE @@ -38,9 +38,11 @@ from .redtube import RedTubeIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE +from .statigram import StatigramIE from .steam import SteamIE from .teamcoco import TeamcocoIE from .ted import TEDIE +from .tudou import TudouIE from .tumblr import TumblrIE from .ustream import UstreamIE from .vbox7 import Vbox7IE @@ -48,8 +50,8 @@ from .vevo import VevoIE from .vimeo import VimeoIE from .vine import VineIE from .worldstarhiphop import WorldStarHipHopIE -from .xnxx import XNXXIE from .xhamster import XHamsterIE +from .xnxx import XNXXIE from .xvideos import XVideosIE from .yahoo import YahooIE, YahooSearchIE from .youjizz import YouJizzIE @@ -58,6 +60,7 @@ from .youporn import YouPornIE from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE from .zdf import ZDFIE + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -127,6 +130,8 @@ def gen_extractors(): StatigramIE(), BreakIE(), VevoIE(), + JukeboxIE(), + TudouIE(), GenericIE() ] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 062f4cf1e..64d63e109 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -211,7 +211,7 @@ class InfoExtractor(object): raise ExtractorError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % _name) + u'please report this issue on http://yt-dl.org/bug' % _name) return None def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index e922bd140..ff2cdeebb 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -46,14 +46,18 @@ class GooglePlusIE(InfoExtractor): video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') - # Step 2, Stimulate clicking the image box to launch video - video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + # Step 2, Simulate clicking the image box to launch video + DOMAIN = 'https://plus.google.com' + video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') + if not video_page.startswith(DOMAIN): + video_page = DOMAIN + video_page + webpage = self._download_webpage(video_page, video_id, u'Downloading video page') # Extract video links on video page """Extract video links of all sizes""" - pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' + pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' mobj = re.findall(pattern, webpage) if len(mobj) == 0: raise ExtractorError(u'Unable to extract video links') diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py new file mode 100644 index 000000000..c7bb234fe --- /dev/null +++ b/youtube_dl/extractor/jukebox.py @@ -0,0 +1,56 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unescapeHTML, +) + +class JukeboxIE(InfoExtractor): + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html' + _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>' + _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"' + _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>' + _IS_YOUTUBE = r'config":{"file":"(?P<youtube_url>http:[\\][/][\\][/]www[.]youtube[.]com[\\][/]watch[?]v=[^"]+)"' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + html = self._download_webpage(url, video_id) + + mobj = re.search(self._IFRAME, html) + if mobj is None: + raise ExtractorError(u'Cannot extract iframe url') + iframe_url = unescapeHTML(mobj.group('iframe')) + + iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe') + mobj = re.search(r'class="jkb_waiting"', iframe_html) + if mobj is not None: + raise ExtractorError(u'Video is not available(in your country?)!') + + self.report_extraction(video_id) + + mobj = re.search(self._VIDEO_URL, iframe_html) + if mobj is None: + mobj = re.search(self._IS_YOUTUBE, iframe_html) + if mobj is None: + raise ExtractorError(u'Cannot extract video url') + youtube_url = unescapeHTML(mobj.group('youtube_url')).replace('\/','/') + self.to_screen(u'Youtube video detected') + return self.url_result(youtube_url,ie='Youtube') + video_url = unescapeHTML(mobj.group('video_url')).replace('\/','/') + video_ext = unescapeHTML(mobj.group('video_ext')) + + mobj = re.search(self._TITLE, html) + if mobj is None: + raise ExtractorError(u'Cannot extract title') + title = unescapeHTML(mobj.group('title')) + artist = unescapeHTML(mobj.group('artist')) + + return [{'id': video_id, + 'url': video_url, + 'title': artist + '-' + title, + 'ext': video_ext + }] diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py new file mode 100644 index 000000000..9ca860ab0 --- /dev/null +++ b/youtube_dl/extractor/tudou.py @@ -0,0 +1,32 @@ +import re + +from .common import InfoExtractor + + +class TudouIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(2).replace('.html','') + webpage = self._download_webpage(url, video_id) + video_id = re.search('"k":(.+?),',webpage).group(1) + title = re.search(",kw:\"(.+)\"",webpage) + if title is None: + title = re.search(",kw: \'(.+)\'",webpage) + title = title.group(1) + thumbnail_url = re.search(",pic: \'(.+?)\'",webpage) + if thumbnail_url is None: + thumbnail_url = re.search(",pic:\"(.+?)\"",webpage) + thumbnail_url = thumbnail_url.group(1) + info_url = "http://v2.tudou.com/f?id="+str(video_id) + webpage = self._download_webpage(info_url, video_id, "Opening the info webpage") + final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1) + ext = (final_url.split('?')[0]).split('.')[-1] + return [{ + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + }] diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index f628e4fb1..531d0889f 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -18,7 +18,7 @@ class WorldStarHipHopIE(InfoExtractor): if 'youtube' in video_url: self.to_screen(u'Youtube video detected:') - return self.url_result('%s' % video_url, ie='Youtube') + return self.url_result(video_url, ie='Youtube') if 'mp4' in video_url: ext = 'mp4' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b34c1a7b9..de653cb3d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -129,12 +129,13 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - @staticmethod - def _decrypt_signature(s): + def _decrypt_signature(self, s): """Decrypt the key the two subkeys must have a length of 43""" (a,b) = s.split('.') if len(a) != 43 or len(b) != 43: - raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') + raise ExtractorError(u'Unable to decrypt signature, subkeys lengths %d.%d not supported; retrying might work' % (len(a), len(b))) + if self._downloader.params.get('verbose'): + self.to_screen('encrypted signature length %d.%d' % (len(a), len(b))) b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] a = a[-40:] s_dec = '.'.join((a,b))[::-1] @@ -484,6 +485,8 @@ class YoutubeIE(InfoExtractor): try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) + if not mobj: + raise ValueError('Could not find vevo ID') info = json.loads(mobj.group(1)) args = info['args'] # Easy way to know if the 's' value is in url_encoded_fmt_stream_map |