about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- test/test_all_urls.py 18
-rw-r--r-- test/test_youtube_subtitles.py 12
-rw-r--r-- test/tests.json 27
-rw-r--r-- youtube_dl/FileDownloader.py 6
-rwxr-xr-x youtube_dl/InfoExtractors.py 205
-rw-r--r-- youtube_dl/version.py 2
6 files changed, 253 insertions, 17 deletions
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index a40360122..dd67286a7 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -7,7 +7,7 @@ import unittest
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE
+from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE
class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
@@ -29,6 +29,22 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos'))
+ def test_justin_tv_channelid_matching(self):
+ self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv"))
+ self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/"))
+ self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/"))
+
+ def test_justintv_videoid_matching(self):
+ self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483"))
+
+ def test_justin_tv_chapterid_matching(self):
+ self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
+
def test_youtube_extract(self):
self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')
self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')
diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py
index a123e6d72..c80c90cbe 100644
--- a/test/test_youtube_subtitles.py
+++ b/test/test_youtube_subtitles.py
@@ -28,7 +28,9 @@ compat_urllib_request.install_opener(opener)
class FakeDownloader(FileDownloader):
def __init__(self):
self.result = []
- self.params = parameters
+ # Different instances of the downloader can't share the same dictionary:
+ # some tests set the "subtitleslang" parameter, which would break the md5 checks.
+ self.params = dict(parameters)
def to_screen(self, s):
print(s)
def trouble(self, s, tb=None):
@@ -96,6 +98,14 @@ class TestYoutubeSubtitles(unittest.TestCase):
IE = YoutubeIE(DL)
info_dict = IE.extract('QRS8MkLhQmM')
self.assertEqual(info_dict, None)
+ def test_youtube_automatic_captions(self):
+ DL = FakeDownloader()
+ DL.params['writesubtitles'] = True
+ DL.params['subtitleslang'] = 'it'
+ IE = YoutubeIE(DL)
+ info_dict = IE.extract('8YoUxe5ncPo')
+ sub = info_dict[0]['subtitles'][0]
+ self.assertTrue(sub[2] is not None)
if __name__ == '__main__':
unittest.main()
diff --git a/test/tests.json b/test/tests.json
index f57ebf1c9..04be912ce 100644
--- a/test/tests.json
+++ b/test/tests.json
@@ -482,5 +482,32 @@
"title": "Louis C.K. Interview Pt. 1 11/3/11",
"description": "Louis C.K. got starstruck by George W. Bush, so what? Part one."
}
+ },
+ {
+ "name": "XHamster",
+ "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html",
+ "file": "1509445.flv",
+ "md5": "9f48e0e8d58e3076bb236ff412ab62fa",
+ "info_dict":{
+ "title":"FemaleAgent Shy beauty takes the bait"
+ }
+ },
+ {
+ "name": "Hypem",
+ "url": "http://hypem.com/track/1v6ga/BODYWORK+-+TAME",
+ "file": "1v6ga.mp3",
+ "md5": "b9cc91b5af8995e9f0c1cee04c575828",
+ "info_dict":{
+ "title":"TAME"
+ }
+ },
+ {
+ "name": "Vbox7",
+ "url": "http://vbox7.com/play:249bb972c2",
+ "file": "249bb972c2.flv",
+ "md5": "9c70d6d956f888bdc08c124acc120cfe",
+ "info_dict":{
+ "title":"Смях! Чудо - чист за секунди - Скрита камера"
+ }
}
]
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py
index 2c35a05d8..bf0f5bb9e 100644
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -814,12 +814,10 @@ class FileDownloader(object):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
-# args = ['mmsclient', url] # doesn't work anymore
-# args = ['wpro', url, '-O', tmpfilename] # dont work
args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]
# Check for mplayer first
try:
- subprocess.call(args[0], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+ subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
except (OSError, IOError):
self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] )
return False
@@ -839,7 +837,7 @@ class FileDownloader(object):
return True
else:
self.to_stderr(u"\n")
- self.report_error(u'%s exited with code %d' % (args[0], retval))
+ self.report_error(u'mplayer exited with code %d' % retval)
return False
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 4ca744daf..24a77a1ab 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -376,6 +376,34 @@ class YoutubeIE(InfoExtractor):
return (u'Did not fetch video subtitles', None, None)
return (None, sub_lang, sub)
+ def _request_automatic_caption(self, video_id, webpage):
+ """We need the webpage for getting the captions url, pass it as an
+ argument to speed up the process."""
+ sub_lang = self._downloader.params.get('subtitleslang')
+ sub_format = self._downloader.params.get('subtitlesformat')
+ self.to_screen(u'%s: Looking for automatic captions' % video_id)
+ mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+ err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+ if mobj is None:
+ return [(err_msg, None, None)]
+ player_config = json.loads(mobj.group(1))
+ try:
+ args = player_config[u'args']
+ caption_url = args[u'ttsurl']
+ timestamp = args[u'timestamp']
+ params = compat_urllib_parse.urlencode({
+ 'lang': 'en',
+ 'tlang': sub_lang,
+ 'fmt': sub_format,
+ 'ts': timestamp,
+ 'kind': 'asr',
+ })
+ subtitles_url = caption_url + '&' + params
+ sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
+ return [(None, sub_lang, sub)]
+ except KeyError:
+ return [(err_msg, None, None)]
+
def _extract_subtitle(self, video_id):
"""
Return a list with a tuple:
@@ -623,7 +651,14 @@ class YoutubeIE(InfoExtractor):
if video_subtitles:
(sub_error, sub_lang, sub) = video_subtitles[0]
if sub_error:
- self._downloader.report_error(sub_error)
+ # We try with the automatic captions
+ video_subtitles = self._request_automatic_caption(video_id, video_webpage)
+ (sub_error_auto, sub_lang, sub) = video_subtitles[0]
+ if sub is not None:
+ pass
+ else:
+ # We report the original error
+ self._downloader.report_error(sub_error)
if self._downloader.params.get('allsubtitles', False):
video_subtitles = self._extract_all_subtitles(video_id)
@@ -1025,7 +1060,7 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
- _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
IE_NAME = u'vimeo'
def _real_extract(self, url, new_video=True):
@@ -1037,7 +1072,7 @@ class VimeoIE(InfoExtractor):
video_id = mobj.group('id')
if not mobj.group('proto'):
url = 'https://' + url
- if mobj.group('direct_link'):
+ if mobj.group('direct_link') or mobj.group('pro'):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
@@ -1064,7 +1099,7 @@ class VimeoIE(InfoExtractor):
# Extract uploader and uploader_id
video_uploader = config["video"]["owner"]["name"]
- video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
+ video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
# Extract video thumbnail
video_thumbnail = config["video"]["thumbnail"]
@@ -1884,7 +1919,7 @@ class FacebookIE(InfoExtractor):
class BlipTVIE(InfoExtractor):
"""Information extractor for blip.tv"""
- _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
_URL_EXT = r'^.*\.([a-z0-9]+)$'
IE_NAME = u'blip.tv'
@@ -1897,6 +1932,10 @@ class BlipTVIE(InfoExtractor):
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
+ # See https://github.com/rg3/youtube-dl/issues/857
+ api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
+ if api_mobj is not None:
+ url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
urlp = compat_urllib_parse_urlparse(url)
if urlp.path.startswith('/play/'):
request = compat_urllib_request.Request(url)
@@ -3941,7 +3980,7 @@ class SpiegelIE(InfoExtractor):
video_id = m.group('videoID')
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
+ m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
if not m:
raise ExtractorError(u'Cannot find title')
video_title = unescapeHTML(m.group(1))
@@ -4070,8 +4109,8 @@ class ZDFIE(InfoExtractor):
if streams is None:
raise ExtractorError(u'No media url found.')
- # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' und mms url
- # s['media_type'] == 'hstreaming' -> use 'Quicktime' und rtsp url
+ # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+ # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
# choose first/default media type and highest quality for now
for s in streams: #find 300 - dsl1000mbit
if s['quality'] == '300' and s['media_type'] == 'wstreaming':
@@ -4263,7 +4302,7 @@ class HowcastIE(InfoExtractor):
self.report_extraction(video_id)
- mobj = re.search(r'\'file\': "(http://mobile-media\.howcast\.com/\d+\.mp4)"', webpage)
+ mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract video URL')
video_url = mobj.group(1)
@@ -4444,6 +4483,149 @@ class TeamcocoIE(InfoExtractor):
'thumbnail': thumbnail,
'description': description,
}]
+
+class XHamsterIE(InfoExtractor):
+ """Information Extractor for xHamster"""
+ _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
+
+ def _real_extract(self,url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+ webpage = self._download_webpage(mrss_url, video_id)
+ mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract media URL')
+ if len(mobj.group('server')) == 0:
+ video_url = compat_urllib_parse.unquote(mobj.group('file'))
+ else:
+ video_url = mobj.group('server')+'/key='+mobj.group('file')
+ video_extension = video_url.split('.')[-1]
+
+ mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract title')
+ video_title = unescapeHTML(mobj.group('title'))
+
+ mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
+ if mobj is None:
+ video_description = u''
+ else:
+ video_description = unescapeHTML(mobj.group('description'))
+
+ mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract upload date')
+ video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
+
+ mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
+ if mobj is None:
+ video_uploader_id = u'anonymous'
+ else:
+ video_uploader_id = mobj.group('uploader_id')
+
+ mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract thumbnail URL')
+ video_thumbnail = mobj.group('thumbnail')
+
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': video_extension,
+ 'title': video_title,
+ 'description': video_description,
+ 'upload_date': video_upload_date,
+ 'uploader_id': video_uploader_id,
+ 'thumbnail': video_thumbnail
+ }]
+
+class HypemIE(InfoExtractor):
+ """Information Extractor for hypem"""
+ _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ track_id = mobj.group(1)
+
+ data = { 'ax': 1, 'ts': time.time() }
+ data_encoded = compat_urllib_parse.urlencode(data)
+ complete_url = url + "?" + data_encoded
+ request = compat_urllib_request.Request(complete_url)
+ response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
+ cookie = urlh.headers.get('Set-Cookie', '')
+
+ self.report_extraction(track_id)
+ mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extrack tracks')
+ html_tracks = mobj.group(1).strip()
+ try:
+ track_list = json.loads(html_tracks)
+ track = track_list[u'tracks'][0]
+ except ValueError:
+ raise ExtractorError(u'Hypemachine contained invalid JSON.')
+
+ key = track[u"key"]
+ track_id = track[u"id"]
+ artist = track[u"artist"]
+ title = track[u"song"]
+
+ serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
+ request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
+ request.add_header('cookie', cookie)
+ song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
+ try:
+ song_data = json.loads(song_data_json)
+ except ValueError:
+ raise ExtractorError(u'Hypemachine contained invalid JSON.')
+ final_url = song_data[u"url"]
+
+ return [{
+ 'id': track_id,
+ 'url': final_url,
+ 'ext': "mp3",
+ 'title': title,
+ 'artist': artist,
+ }]
+
+class Vbox7IE(InfoExtractor):
+ """Information Extractor for Vbox7"""
+ _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+ def _real_extract(self,url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ video_id = mobj.group(1)
+
+ redirect_page, urlh = self._download_webpage_handle(url, video_id)
+ redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
+ webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+ title = re.search(r'<title>(.*)</title>', webpage)
+ title = (title.group(1)).split('/')[0].strip()
+
+ ext = "flv"
+ info_url = "http://vbox7.com/play/magare.do"
+ data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
+ info_request = compat_urllib_request.Request(info_url, data)
+ info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+ if info_response is None:
+ raise ExtractorError(u'Unable to extract the media url')
+ (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+
+ return [{
+ 'id': video_id,
+ 'url': final_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ }]
def gen_extractors():
""" Return a list of an instance of every supported extractor.
@@ -4463,8 +4645,8 @@ def gen_extractors():
YahooSearchIE(),
DepositFilesIE(),
FacebookIE(),
- BlipTVUserIE(),
BlipTVIE(),
+ BlipTVUserIE(),
VimeoIE(),
MyVideoIE(),
ComedyCentralIE(),
@@ -4507,6 +4689,9 @@ def gen_extractors():
VineIE(),
FlickrIE(),
TeamcocoIE(),
+ XHamsterIE(),
+ HypemIE(),
+ Vbox7IE(),
GenericIE()
]
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index dbc928394..1cda7fa74 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.05.14'
+__version__ = '2013.05.23'