about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 11
-rw-r--r-- youtube_dl/extractor/common.py 2
-rw-r--r-- youtube_dl/extractor/googleplus.py 10
-rw-r--r-- youtube_dl/extractor/jukebox.py 56
-rw-r--r-- youtube_dl/extractor/tudou.py 32
-rw-r--r-- youtube_dl/extractor/worldstarhiphop.py 2
-rw-r--r-- youtube_dl/extractor/youtube.py 9
7 files changed, 111 insertions, 11 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 9878ad942..0ea990860 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,8 +4,8 @@ from .arte import ArteTvIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
-from .comedycentral import ComedyCentralIE
from .collegehumor import CollegeHumorIE
+from .comedycentral import ComedyCentralIE
from .dailymotion import DailymotionIE
from .depositfiles import DepositFilesIE
from .eighttracks import EightTracksIE
@@ -21,6 +21,7 @@ from .howcast import HowcastIE
from .hypem import HypemIE
from .ina import InaIE
from .infoq import InfoQIE
+from .jukebox import JukeboxIE
from .justintv import JustinTVIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
@@ -30,7 +31,6 @@ from .mtv import MTVIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .nba import NBAIE
-from .statigram import StatigramIE
from .photobucket import PhotobucketIE
from .pornotube import PornotubeIE
from .rbmaradio import RBMARadioIE
@@ -38,9 +38,11 @@ from .redtube import RedTubeIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
+from .statigram import StatigramIE
from .steam import SteamIE
from .teamcoco import TeamcocoIE
from .ted import TEDIE
+from .tudou import TudouIE
from .tumblr import TumblrIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
@@ -48,8 +50,8 @@ from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE
from .worldstarhiphop import WorldStarHipHopIE
-from .xnxx import XNXXIE
from .xhamster import XHamsterIE
+from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .yahoo import YahooIE, YahooSearchIE
from .youjizz import YouJizzIE
@@ -58,6 +60,7 @@ from .youporn import YouPornIE
from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .zdf import ZDFIE
+
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
@@ -127,6 +130,8 @@ def gen_extractors():
StatigramIE(),
BreakIE(),
VevoIE(),
+ JukeboxIE(),
+ TudouIE(),
GenericIE()
]
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 062f4cf1e..64d63e109 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -211,7 +211,7 @@ class InfoExtractor(object):
raise ExtractorError(u'Unable to extract %s' % _name)
else:
self._downloader.report_warning(u'unable to extract %s; '
- u'please report this issue on GitHub.' % _name)
+ u'please report this issue on http://yt-dl.org/bug' % _name)
return None
def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index e922bd140..ff2cdeebb 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -46,14 +46,18 @@ class GooglePlusIE(InfoExtractor):
video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
webpage, 'title', default=u'NA')
- # Step 2, Stimulate clicking the image box to launch video
- video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
+ # Step 2, Simulate clicking the image box to launch video
+ DOMAIN = 'https://plus.google.com'
+ video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
webpage, u'video page URL')
+ if not video_page.startswith(DOMAIN):
+ video_page = DOMAIN + video_page
+
webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
# Extract video links on video page
"""Extract video links of all sizes"""
- pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
+ pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
mobj = re.findall(pattern, webpage)
if len(mobj) == 0:
raise ExtractorError(u'Unable to extract video links')
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
new file mode 100644
index 000000000..c7bb234fe
--- /dev/null
+++ b/youtube_dl/extractor/jukebox.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unescapeHTML,
+)
+
+class JukeboxIE(InfoExtractor):
+    """Information extractor for the jukebox pages matched by _VALID_URL.
+
+    The watch page embeds an iframe; the iframe's player configuration holds
+    either a direct media URL or a link to a YouTube video.
+    """
+
+    # The last path component ends in ",<video_id>.html".
+    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
+    _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
+    _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
+    _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
+    _IS_YOUTUBE = r'config":{"file":"(?P<youtube_url>http:[\\][/][\\][/]www[.]youtube[.]com[\\][/]watch[?]v=[^"]+)"'
+
+    def _real_extract(self, url):
+        """Extract the video behind ``url``.
+
+        Returns a one-element list with the info dictionary (id, url, title,
+        ext), or the result of ``url_result`` when the player wraps a YouTube
+        video.
+
+        Raises ExtractorError when the URL does not match, when the iframe,
+        the media URL or the title cannot be found, or when the iframe page
+        carries the "jkb_waiting" marker.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('video_id')
+
+        html = self._download_webpage(url, video_id)
+
+        mobj = re.search(self._IFRAME, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract iframe url')
+        iframe_url = unescapeHTML(mobj.group('iframe'))
+
+        iframe_html = self._download_webpage(iframe_url, video_id, u'Downloading iframe')
+        # An element with class "jkb_waiting" is treated as "no video served".
+        if re.search(r'class="jkb_waiting"', iframe_html) is not None:
+            raise ExtractorError(u'Video is not available(in your country?)!')
+
+        self.report_extraction(video_id)
+
+        mobj = re.search(self._VIDEO_URL, iframe_html)
+        if mobj is None:
+            # No direct media file; check whether the player points at YouTube.
+            mobj = re.search(self._IS_YOUTUBE, iframe_html)
+            if mobj is None:
+                raise ExtractorError(u'Cannot extract video url')
+            # Slashes in the player configuration are backslash-escaped.
+            youtube_url = unescapeHTML(mobj.group('youtube_url')).replace('\\/', '/')
+            self.to_screen(u'Youtube video detected')
+            return self.url_result(youtube_url, ie='Youtube')
+        video_url = unescapeHTML(mobj.group('video_url')).replace('\\/', '/')
+        video_ext = unescapeHTML(mobj.group('video_ext'))
+
+        # Title and artist come from the outer watch page, not the iframe.
+        mobj = re.search(self._TITLE, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract title')
+        title = unescapeHTML(mobj.group('title'))
+        artist = unescapeHTML(mobj.group('artist'))
+
+        return [{
+            'id': video_id,
+            'url': video_url,
+            'title': artist + '-' + title,
+            'ext': video_ext,
+        }]
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
new file mode 100644
index 000000000..9ca860ab0
--- /dev/null
+++ b/youtube_dl/extractor/tudou.py
@@ -0,0 +1,32 @@
+import re
+
+from .common import InfoExtractor
+
+
+class TudouIE(InfoExtractor):
+    """Information extractor for tudou.com "listplay" and "programs" pages."""
+
+    # Group 2 captures the last path component, including any ".html" suffix.
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/([^/]+)'
+
+    def _search_first(self, patterns, webpage, name):
+        """Return group 1 of the first pattern in ``patterns`` found in ``webpage``.
+
+        The last pattern is delegated to ``_search_regex`` so that a page
+        matching none of them is reported through the extractor's regular
+        error handling rather than an AttributeError on ``None``.
+        """
+        for pattern in patterns[:-1]:
+            mobj = re.search(pattern, webpage)
+            if mobj is not None:
+                return mobj.group(1)
+        return self._search_regex(patterns[-1], webpage, name)
+
+    def _real_extract(self, url):
+        """Return a one-element list with the info dictionary for ``url``.
+
+        The dictionary carries id, url, ext, title and thumbnail.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        # The URL component only labels the first download; the value of the
+        # page's "k" field becomes the actual video id.
+        page_name = mobj.group(2).replace('.html', '')
+        webpage = self._download_webpage(url, page_name)
+
+        video_id = self._search_regex(r'"k":(.+?),', webpage, u'video id')
+        # Title and thumbnail are each looked up in two quoting variants.
+        title = self._search_first(
+            [r',kw:"(.+)"', r",kw: '(.+)'"], webpage, u'title')
+        thumbnail_url = self._search_first(
+            [r",pic: '(.+?)'", r',pic:"(.+?)"'], webpage, u'thumbnail URL')
+
+        info_url = 'http://v2.tudou.com/f?id=' + str(video_id)
+        info_page = self._download_webpage(info_url, video_id, u'Opening the info webpage')
+        final_url = self._search_regex(r'>(.+?)</f>', info_page, u'video URL')
+        # Extension is whatever follows the last dot before the query string.
+        ext = final_url.split('?')[0].split('.')[-1]
+
+        return [{
+            'id': video_id,
+            'url': final_url,
+            'ext': ext,
+            'title': title,
+            'thumbnail': thumbnail_url,
+        }]
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index f628e4fb1..531d0889f 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -18,7 +18,7 @@ class WorldStarHipHopIE(InfoExtractor):
if 'youtube' in video_url:
self.to_screen(u'Youtube video detected:')
- return self.url_result('%s' % video_url, ie='Youtube')
+ return self.url_result(video_url, ie='Youtube')
if 'mp4' in video_url:
ext = 'mp4'
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b34c1a7b9..de653cb3d 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -129,12 +129,13 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self.to_screen(u'RTMP download detected')
- @staticmethod
- def _decrypt_signature(s):
+ def _decrypt_signature(self, s):
"""Decrypt the key the two subkeys must have a length of 43"""
(a,b) = s.split('.')
if len(a) != 43 or len(b) != 43:
- raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
+ raise ExtractorError(u'Unable to decrypt signature, subkeys lengths %d.%d not supported; retrying might work' % (len(a), len(b)))
+ if self._downloader.params.get('verbose'):
+ self.to_screen('encrypted signature length %d.%d' % (len(a), len(b)))
b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
a = a[-40:]
s_dec = '.'.join((a,b))[::-1]
@@ -484,6 +485,8 @@ class YoutubeIE(InfoExtractor):
try:
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+ if not mobj:
+ raise ValueError('Could not find vevo ID')
info = json.loads(mobj.group(1))
args = info['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map