diff options
| -rw-r--r-- | test/test_all_urls.py | 18 | ||||
| -rw-r--r-- | test/test_youtube_subtitles.py | 12 | ||||
| -rw-r--r-- | test/tests.json | 27 | ||||
| -rw-r--r-- | youtube_dl/FileDownloader.py | 6 | ||||
| -rwxr-xr-x | youtube_dl/InfoExtractors.py | 205 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
6 files changed, 253 insertions, 17 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a40360122..dd67286a7 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -7,7 +7,7 @@ import unittest  import os  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE +from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE  class TestAllURLsMatching(unittest.TestCase):      def test_youtube_playlist_matching(self): @@ -29,6 +29,22 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))          self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')) +    def test_justin_tv_channelid_matching(self): +        self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv")) +        self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/")) +        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/")) + +    def test_justintv_videoid_matching(self): +        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483")) + +    def test_justin_tv_chapterid_matching(self): +        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) +      def test_youtube_extract(self):          self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')          self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index a123e6d72..c80c90cbe 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -28,7 +28,9 @@ compat_urllib_request.install_opener(opener)  class FakeDownloader(FileDownloader):      def __init__(self):          self.result = [] -        self.params = parameters +        # Different instances of the downloader can't share the same dictionary +        # some test set the "sublang" parameter, which would break the md5 checks. +        self.params = dict(parameters)      def to_screen(self, s):          print(s)      def trouble(self, s, tb=None): @@ -96,6 +98,14 @@ class TestYoutubeSubtitles(unittest.TestCase):          IE = YoutubeIE(DL)          info_dict = IE.extract('QRS8MkLhQmM')          self.assertEqual(info_dict, None) +    def test_youtube_automatic_captions(self): +        DL = FakeDownloader() +        DL.params['writesubtitles'] = True +        DL.params['subtitleslang'] = 'it' +        IE = YoutubeIE(DL) +        info_dict = IE.extract('8YoUxe5ncPo') +        sub = info_dict[0]['subtitles'][0] +        self.assertTrue(sub[2] is not None)  if __name__ == '__main__':      unittest.main() diff --git a/test/tests.json b/test/tests.json index f57ebf1c9..04be912ce 100644 --- a/test/tests.json +++ b/test/tests.json @@ -482,5 +482,32 @@        "title": "Louis C.K. Interview Pt. 1 11/3/11",        "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one."      } +  }, +  { +    "name": "XHamster", +    "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html", +    "file": "1509445.flv", +    "md5": "9f48e0e8d58e3076bb236ff412ab62fa", +    "info_dict":{ +      "title":"FemaleAgent Shy beauty takes the bait" +    } +  }, +  { +    "name": "Hypem", +    "url": "http://hypem.com/track/1v6ga/BODYWORK+-+TAME", +    "file": "1v6ga.mp3", +    "md5": "b9cc91b5af8995e9f0c1cee04c575828", +    "info_dict":{ +      "title":"TAME" +    } +  }, +  { +    "name": "Vbox7", +    "url": "http://vbox7.com/play:249bb972c2", +    "file": "249bb972c2.flv", +    "md5": "9c70d6d956f888bdc08c124acc120cfe", +    "info_dict":{ +      "title":"Смях! Чудо - чист за секунди - Скрита камера" +    }    }  ] diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 2c35a05d8..bf0f5bb9e 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -814,12 +814,10 @@ class FileDownloader(object):          self.report_destination(filename)          tmpfilename = self.temp_name(filename) -#        args = ['mmsclient', url]                                     # doesn't work anymore -#        args = ['wpro', url, '-O', tmpfilename]                       # dont work          args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]          # Check for mplayer first          try: -            subprocess.call(args[0], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +            subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)          except (OSError, IOError):              self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] )              return False @@ -839,7 +837,7 @@ class FileDownloader(object):              return True          else:              self.to_stderr(u"\n") -            self.report_error(u'%s exited with code %d' % (args[0], retval)) +            self.report_error(u'mplayer exited with code %d' % retval)              return False diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4ca744daf..24a77a1ab 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -376,6 +376,34 @@ class YoutubeIE(InfoExtractor):              return (u'Did not fetch video subtitles', None, None)          return (None, sub_lang, sub) +    def _request_automatic_caption(self, video_id, webpage): +        """We need the webpage for getting the captions url, pass it as an +           argument to speed up the process.""" +        sub_lang = self._downloader.params.get('subtitleslang') +        sub_format = self._downloader.params.get('subtitlesformat') +        self.to_screen(u'%s: Looking for automatic captions' % video_id) +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang +        if mobj is None: +            return [(err_msg, None, None)] +        player_config = json.loads(mobj.group(1)) +        try: +            args = player_config[u'args'] +            caption_url = args[u'ttsurl'] +            timestamp = args[u'timestamp'] +            params = compat_urllib_parse.urlencode({ +                'lang': 'en', +                'tlang': sub_lang, +                'fmt': sub_format, +                'ts': timestamp, +                'kind': 'asr', +            }) +            subtitles_url = caption_url + '&' + params +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') +            return [(None, sub_lang, sub)] +        except KeyError: +            return [(err_msg, None, None)] +      def _extract_subtitle(self, video_id):          """          Return a list with a tuple: @@ -623,7 +651,14 @@ class YoutubeIE(InfoExtractor):              if video_subtitles:                  (sub_error, sub_lang, sub) = video_subtitles[0]                  if sub_error: -                    self._downloader.report_error(sub_error) +                    # We try with the automatic captions +                    video_subtitles = self._request_automatic_caption(video_id, video_webpage) +                    (sub_error_auto, sub_lang, sub) = video_subtitles[0] +                    if sub is not None: +                        pass +                    else: +                        # We report the original error +                        self._downloader.report_error(sub_error)          if self._downloader.params.get('allsubtitles', False):              video_subtitles = self._extract_all_subtitles(video_id) @@ -1025,7 +1060,7 @@ class VimeoIE(InfoExtractor):      """Information extractor for vimeo.com."""      # _VALID_URL matches Vimeo URLs -    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)' +    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'      IE_NAME = u'vimeo'      def _real_extract(self, url, new_video=True): @@ -1037,7 +1072,7 @@ class VimeoIE(InfoExtractor):          video_id = mobj.group('id')          if not mobj.group('proto'):              url = 'https://' + url -        if mobj.group('direct_link'): +        if mobj.group('direct_link') or mobj.group('pro'):              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information @@ -1064,7 +1099,7 @@ class VimeoIE(InfoExtractor):          # Extract uploader and uploader_id          video_uploader = config["video"]["owner"]["name"] -        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] +        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None          # Extract video thumbnail          video_thumbnail = config["video"]["thumbnail"] @@ -1884,7 +1919,7 @@ class FacebookIE(InfoExtractor):  class BlipTVIE(InfoExtractor):      """Information extractor for blip.tv""" -    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' +    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'      _URL_EXT = r'^.*\.([a-z0-9]+)$'      IE_NAME = u'blip.tv' @@ -1897,6 +1932,10 @@ class BlipTVIE(InfoExtractor):          if mobj is None:              raise ExtractorError(u'Invalid URL: %s' % url) +        # See https://github.com/rg3/youtube-dl/issues/857 +        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url) +        if api_mobj is not None: +            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')          urlp = compat_urllib_parse_urlparse(url)          if urlp.path.startswith('/play/'):              request = compat_urllib_request.Request(url) @@ -3941,7 +3980,7 @@ class SpiegelIE(InfoExtractor):          video_id = m.group('videoID')          webpage = self._download_webpage(url, video_id) -        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) +        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)          if not m:              raise ExtractorError(u'Cannot find title')          video_title = unescapeHTML(m.group(1)) @@ -4070,8 +4109,8 @@ class ZDFIE(InfoExtractor):          if streams is None:              raise ExtractorError(u'No media url found.') -        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' und mms url -        # s['media_type'] == 'hstreaming' -> use 'Quicktime' und rtsp url +        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url +        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url          # choose first/default media type and highest quality for now          for s in streams:        #find 300 - dsl1000mbit              if s['quality'] == '300' and s['media_type'] == 'wstreaming': @@ -4263,7 +4302,7 @@ class HowcastIE(InfoExtractor):          self.report_extraction(video_id) -        mobj = re.search(r'\'file\': "(http://mobile-media\.howcast\.com/\d+\.mp4)"', webpage) +        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)          if mobj is None:              raise ExtractorError(u'Unable to extract video URL')          video_url = mobj.group(1) @@ -4444,6 +4483,149 @@ class TeamcocoIE(InfoExtractor):              'thumbnail':   thumbnail,              'description': description,          }] +         +class XHamsterIE(InfoExtractor): +    """Information Extractor for xHamster""" +    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' + +    def _real_extract(self,url): +        mobj = re.match(self._VALID_URL, url) + +        video_id = mobj.group('id') +        mrss_url='http://xhamster.com/movies/%s/.html' % video_id +        webpage = self._download_webpage(mrss_url, video_id) +        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) +        if mobj is None: +            raise ExtractorError(u'Unable to extract media URL') +        if len(mobj.group('server')) == 0: +            video_url = compat_urllib_parse.unquote(mobj.group('file')) +        else: +            video_url = mobj.group('server')+'/key='+mobj.group('file') +        video_extension = video_url.split('.')[-1] + +        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage) +        if mobj is None: +            raise ExtractorError(u'Unable to extract title') +        video_title = unescapeHTML(mobj.group('title')) + +        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage) +        if mobj is None: +            video_description = u'' +        else: +            video_description = unescapeHTML(mobj.group('description')) + +        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) +        if mobj is None: +            raise ExtractorError(u'Unable to extract upload date') +        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') + +        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage) +        if mobj is None: +            video_uploader_id = u'anonymous' +        else: +            video_uploader_id = mobj.group('uploader_id') + +        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage) +        if mobj is None: +            raise ExtractorError(u'Unable to extract thumbnail URL') +        video_thumbnail = mobj.group('thumbnail') + +        return [{ +            'id':       video_id, +            'url':      video_url, +            'ext':      video_extension, +            'title':    video_title, +            'description': video_description, +            'upload_date': video_upload_date, +            'uploader_id': video_uploader_id, +            'thumbnail': video_thumbnail +        }] + +class HypemIE(InfoExtractor): +    """Information Extractor for hypem""" +    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        if mobj is None: +            raise ExtractorError(u'Invalid URL: %s' % url) +        track_id = mobj.group(1) + +        data = { 'ax': 1, 'ts': time.time() } +        data_encoded = compat_urllib_parse.urlencode(data) +        complete_url = url + "?" + data_encoded +        request = compat_urllib_request.Request(complete_url) +        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url') +        cookie = urlh.headers.get('Set-Cookie', '') + +        self.report_extraction(track_id) +        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL) +        if mobj is None: +            raise ExtractorError(u'Unable to extrack tracks') +        html_tracks = mobj.group(1).strip() +        try: +            track_list = json.loads(html_tracks) +            track = track_list[u'tracks'][0] +        except ValueError: +            raise ExtractorError(u'Hypemachine contained invalid JSON.') + +        key = track[u"key"] +        track_id = track[u"id"] +        artist = track[u"artist"] +        title = track[u"song"] + +        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)) +        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'}) +        request.add_header('cookie', cookie) +        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata') +        try: +            song_data = json.loads(song_data_json) +        except ValueError: +            raise ExtractorError(u'Hypemachine contained invalid JSON.') +        final_url = song_data[u"url"] + +        return [{ +            'id':       track_id, +            'url':      final_url, +            'ext':      "mp3", +            'title':    title, +            'artist':   artist, +        }] + +class Vbox7IE(InfoExtractor): +    """Information Extractor for Vbox7""" +    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' + +    def _real_extract(self,url): +        mobj = re.match(self._VALID_URL, url) +        if mobj is None: +            raise ExtractorError(u'Invalid URL: %s' % url) +        video_id = mobj.group(1) + +        redirect_page, urlh = self._download_webpage_handle(url, video_id) +        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1) +        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') + +        title = re.search(r'<title>(.*)</title>', webpage) +        title = (title.group(1)).split('/')[0].strip() + +        ext = "flv" +        info_url = "http://vbox7.com/play/magare.do" +        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) +        info_request = compat_urllib_request.Request(info_url, data) +        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') +        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') +        if info_response is None: +            raise ExtractorError(u'Unable to extract the media url') +        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + +        return [{ +            'id':        video_id, +            'url':       final_url, +            'ext':       ext, +            'title':     title, +            'thumbnail': thumbnail_url, +        }]  def gen_extractors():      """ Return a list of an instance of every supported extractor. @@ -4463,8 +4645,8 @@ def gen_extractors():          YahooSearchIE(),          DepositFilesIE(),          FacebookIE(), -        BlipTVUserIE(),          BlipTVIE(), +        BlipTVUserIE(),          VimeoIE(),          MyVideoIE(),          ComedyCentralIE(), @@ -4507,6 +4689,9 @@ def gen_extractors():          VineIE(),          FlickrIE(),          TeamcocoIE(), +        XHamsterIE(), +        HypemIE(), +        Vbox7IE(),          GenericIE()      ] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dbc928394..1cda7fa74 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.05.14' +__version__ = '2013.05.23' | 
