diff options
| author | Ismael Mejia <iemejia@gmail.com> | 2013-08-22 23:29:36 +0200 | 
|---|---|---|
| committer | Ismael Mejia <iemejia@gmail.com> | 2013-08-22 23:29:36 +0200 | 
| commit | 18b4e04f1c663e0ea695f6501b860f85af9d7ca1 (patch) | |
| tree | d60ebbf51b8c50f808c6c251fc6c02547052a9dc | |
| parent | d80a064eff4fe2416f9db36b07f1e2ca641f1334 (diff) | |
| parent | 1865ed31b955795f9859df5c1c400d172ae9a28a (diff) | |
Merge branch 'master' into subtitles_rework
44 files changed, 1388 insertions, 458 deletions
| diff --git a/.gitignore b/.gitignore index fca34b8ba..61cb6bc3c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ build/  dist/  MANIFEST  README.txt -README.md  youtube-dl.1  youtube-dl.bash-completion  youtube-dl diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 2b3879f0a..dca963e8f 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -11,30 +11,42 @@ tests = [      # 90      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`",       "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), +    # 89  +    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", +     "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"),      # 88      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<",       "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"),      # 87      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", -     "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), -    # 86 - vfl_ymO4Z 2013/06/27 +     "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), +    # 86      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", -     "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), -    # 85 - vflSAFCP9 2013/07/19 +     "yuioplkjhgfdsazecvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"), +    # 85      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", -     "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), +     ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"),      # 84      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", -     "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), -    # 83 - vflcaqGO8 2013/07/11 +     "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ09876543q1mnbvcxzasdfghjklpoiuew2"), +    # 83      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", -     "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"), +     ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),      # 82      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",       "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), -    # 81 +    # 81 - vflLC8JvQ 2013/07/25      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", -     "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."), +     "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), +    # 79 - vflLC8JvQ 2013/07/25 (sporadic) +    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", +     "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), +] + +tests_age_gate = [ +    # 86 - vflqinMWD +    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", +     "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),  ]  def find_matching(wrong, right): @@ -87,6 +99,8 @@ def genall(tests):  def main():      print(genall(tests)) +    print(u'    Age gate:') +    print(genall(tests_age_gate))  if __name__ == '__main__':      main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index c73d0e467..c54faa380 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -50,6 +50,7 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')          self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')          self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc') +        self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc')      def test_no_duplicates(self):          ies = gen_extractors() diff --git a/test/test_playlists.py b/test/test_playlists.py new file mode 100644 index 000000000..65de3a55c --- /dev/null +++ b/test/test_playlists.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import sys +import unittest +import json + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.utils import * + +from helper import FakeYDL + +class TestPlaylists(unittest.TestCase): +    def assertIsPlaylist(self, info): +        """Make sure the info has '_type' set to 'playlist'""" +        self.assertEqual(info['_type'], 'playlist') + +    def test_dailymotion_playlist(self): +        dl = FakeYDL() +        ie = DailymotionPlaylistIE(dl) +        result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'SPORT') +        self.assertTrue(len(result['entries']) > 20) + +    def test_vimeo_channel(self): +        dl = FakeYDL() +        ie = VimeoChannelIE(dl) +        result = ie.extract('http://vimeo.com/channels/tributes') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'Vimeo Tributes') +        self.assertTrue(len(result['entries']) > 24) + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py deleted file mode 100644 index 51b300532..000000000 --- a/test/test_youtube_sig.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -import unittest -import sys - -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from youtube_dl.extractor.youtube import YoutubeIE -from helper import FakeYDL - -sig = YoutubeIE(FakeYDL())._decrypt_signature - -class TestYoutubeSig(unittest.TestCase): -    def test_92(self): -        wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8" -        right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7" -        self.assertEqual(sig(wrong), right) - -    def test_90(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`" -        right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|" -        self.assertEqual(sig(wrong), right) - -    def test_88(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<" -        right = "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej" -        self.assertEqual(sig(wrong), right) - -    def test_87(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" -        right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" -        self.assertEqual(sig(wrong), right) - -    def test_86(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<" -        right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@" -        self.assertEqual(sig(wrong), right) - -    def test_85(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<" -        right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c" -        self.assertEqual(sig(wrong), right) - -    def test_84(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<" -        right = "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1" -        self.assertEqual(sig(wrong), right) - -    def test_83(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<" -        right = "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<" -        self.assertEqual(sig(wrong), right) - -    def test_82(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<" -        right = "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9" -        self.assertEqual(sig(wrong), right) - -    def test_81(self): -        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>." -        right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>." -        self.assertEqual(sig(wrong), right) - -if __name__ == '__main__': -    unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 155895fe2..217c4a52f 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -79,9 +79,13 @@ class FileDownloader(object):          rate = float(current) / dif          eta = int((float(total) - float(current)) / rate)          (eta_mins, eta_secs) = divmod(eta, 60) -        if eta_mins > 99: -            return '--:--' -        return '%02d:%02d' % (eta_mins, eta_secs) +        (eta_hours, eta_mins) = divmod(eta_mins, 60) +        if eta_hours > 99: +            return '--:--:--' +        if eta_hours == 0: +            return '%02d:%02d' % (eta_mins, eta_secs) +        else: +            return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs)      @staticmethod      def calc_speed(start, now, bytes): @@ -329,6 +333,35 @@ class FileDownloader(object):              self.report_error(u'mplayer exited with code %d' % retval)              return False +    def _download_m3u8_with_ffmpeg(self, filename, url): +        self.report_destination(filename) +        tmpfilename = self.temp_name(filename) + +        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename] +        # Check for ffmpeg first +        try: +            subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +        except (OSError, IOError): +            self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] ) +            return False + +        retval = subprocess.call(args) +        if retval == 0: +            fsize = os.path.getsize(encodeFilename(tmpfilename)) +            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) +            self.try_rename(tmpfilename, filename) +            self._hook_progress({ +                'downloaded_bytes': fsize, +                'total_bytes': fsize, +                'filename': filename, +                'status': 'finished', +            }) +            return True +        else: +            self.to_stderr(u"\n") +            self.report_error(u'ffmpeg exited with code %d' % retval) +            return False +      def _do_download(self, filename, info_dict):          url = info_dict['url'] @@ -354,6 +387,10 @@ class FileDownloader(object):          if url.startswith('mms') or url.startswith('rtsp'):              return self._download_with_mplayer(filename, url) +        # m3u8 manifest are downloaded with ffmpeg +        if determine_ext(url) == u'm3u8': +            return self._download_m3u8_with_ffmpeg(filename, url) +          tmpfilename = self.temp_name(filename)          stream = None diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 8c5e53991..fddf58606 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -100,7 +100,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          self._nopostoverwrites = nopostoverwrites      def get_audio_codec(self, path): -        if not self._exes['ffprobe'] and not self._exes['avprobe']: return None +        if not self._exes['ffprobe'] and not self._exes['avprobe']: +            raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')          try:              cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))]              handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE) @@ -208,7 +209,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):              try:                  os.utime(encodeFilename(new_path), (time.time(), information['filetime']))              except: -                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') +                self._downloader.report_warning(u'Cannot update utime of audio file')          information['filepath'] = new_path          return self._nopostoverwrites,information diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e11d6f994..fa7bb1387 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -261,7 +261,7 @@ class YoutubeDL(object):              self.report_error(u'Erroneous output template')              return None          except ValueError as err: -            self.report_error(u'Insufficient system charset ' + repr(preferredencoding())) +            self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')              return None      def _match_entry(self, info_dict): @@ -535,7 +535,7 @@ class YoutubeDL(object):                  try:                      success = self.fd._do_download(filename, info_dict)                  except (OSError, IOError) as err: -                    raise UnavailableVideoError() +                    raise UnavailableVideoError(err)                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:                      self.report_error(u'unable to download video data: %s' % str(err))                      return @@ -582,7 +582,7 @@ class YoutubeDL(object):                          # No clear decision yet, let IE decide                          keep_video = keep_video_wish              except PostProcessingError as e: -                self.to_stderr(u'ERROR: ' + e.msg) +                self.report_error(e.msg)          if keep_video is False and not self.params.get('keepvideo', False):              try:                  self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 34f3dad0f..c21bf6d4a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -27,6 +27,7 @@ __authors__  = (      'Johny Mo Swag',      'Axel Noack',      'Albert Kim', +    'Pierre Rudloff',  )  __license__ = 'Public Domain' @@ -343,7 +344,7 @@ def parseOpts(overrideArguments=None):              userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')          systemConf = _readOptions('/etc/youtube-dl.conf')          userConf = _readOptions(userConfFile) -        commandLineConf = sys.argv[1:]  +        commandLineConf = sys.argv[1:]          argv = systemConf + userConf + commandLineConf          opts, args = parser.parse_args(argv)          if opts.verbose: @@ -377,7 +378,7 @@ def _real_main(argv=None):      # Set user agent      if opts.user_agent is not None:          std_headers['User-Agent'] = opts.user_agent -     +      # Set referer      if opts.referer is not None:          std_headers['Referer'] = opts.referer @@ -398,6 +399,8 @@ def _real_main(argv=None):              batchurls = batchfd.readlines()              batchurls = [x.strip() for x in batchurls]              batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] +            if opts.verbose: +                sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')          except IOError:              sys.exit(u'ERROR: batch file could not be read')      all_urls = batchurls + args diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4a1c20e9..b4db8f0bf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -12,7 +12,7 @@ from .comedycentral import ComedyCentralIE  from .condenast import CondeNastIE  from .criterion import CriterionIE  from .cspan import CSpanIE -from .dailymotion import DailymotionIE +from .dailymotion import DailymotionIE, DailymotionPlaylistIE  from .depositfiles import DepositFilesIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE @@ -36,23 +36,31 @@ from .ign import IGNIE, OneUPIE  from .ina import InaIE  from .infoq import InfoQIE  from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE +from .kankan import KankanIE  from .keek import KeekIE  from .liveleak import LiveLeakIE  from .livestream import LivestreamIE  from .metacafe import MetacafeIE  from .mixcloud import MixcloudIE  from .mtv import MTVIE +from .muzu import MuzuTVIE  from .myspass import MySpassIE  from .myvideo import MyVideoIE  from .nba import NBAIE +from .ooyala import OoyalaIE +from .pbs import PBSIE  from .photobucket import PhotobucketIE  from .pornotube import PornotubeIE  from .rbmaradio import RBMARadioIE  from .redtube import RedTubeIE  from .ringtv import RingTVIE +from .roxwel import RoxwelIE +from .rtlnow import RTLnowIE  from .sina import SinaIE +from .slashdot import SlashdotIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE @@ -67,10 +75,12 @@ from .tudou import TudouIE  from .tumblr import TumblrIE  from .tutv import TutvIE  from .ustream import UstreamIE +from .unistra import UnistraIE  from .vbox7 import Vbox7IE  from .veoh import VeohIE  from .vevo import VevoIE -from .vimeo import VimeoIE +from .videofyme import VideofyMeIE +from .vimeo import VimeoIE, VimeoChannelIE  from .vine import VineIE  from .c56 import C56IE  from .wat import WatIE @@ -92,6 +102,9 @@ from .youtube import (      YoutubeChannelIE,      YoutubeShowIE,      YoutubeSubscriptionsIE, +    YoutubeRecommendedIE, +    YoutubeWatchLaterIE, +    YoutubeFavouritesIE,  )  from .zdf import ZDFIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 993e30f7a..69b3b0ad7 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,13 +17,14 @@ class ArteTvIE(InfoExtractor):      """      _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'      _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' +    _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'      _LIVE_URL = r'index-[0-9]+\.html$'      IE_NAME = u'arte.tv'      @classmethod      def suitable(cls, url): -        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) +        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))      # TODO implement Live Stream      # from ..utils import compat_urllib_parse @@ -68,6 +69,12 @@ class ArteTvIE(InfoExtractor):              lang = mobj.group('lang')              return self._extract_video(url, id, lang) +        mobj = re.match(self._LIVEWEB_URL, url) +        if mobj is not None: +            name = mobj.group('name') +            lang = mobj.group('lang') +            return self._extract_liveweb(url, name, lang) +          if re.search(self._LIVE_URL, video_id) is not None:              raise ExtractorError(u'Arte live streams are not yet supported, sorry')              # self.extractLiveStream(url) @@ -85,7 +92,7 @@ class ArteTvIE(InfoExtractor):          info_dict = {'id': player_info['VID'],                       'title': player_info['VTI'], -                     'description': player_info['VDE'], +                     'description': player_info.get('VDE'),                       'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),                       'thumbnail': player_info['programImage'],                       'ext': 'flv', @@ -98,12 +105,14 @@ class ArteTvIE(InfoExtractor):                  l = 'F'              elif lang == 'de':                  l = 'A' -            regexes = [r'VO?%s' % l, r'V%s-ST.' % l] +            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]              return any(re.match(r, f['versionCode']) for r in regexes)          # Some formats may not be in the same language as the url          formats = filter(_match_lang, formats)          # We order the formats by quality          formats = sorted(formats, key=lambda f: int(f['height'])) +        # Prefer videos without subtitles in the same language +        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)          # Pick the best quality          format_info = formats[-1]          if format_info['mediaType'] == u'rtmp': @@ -144,3 +153,22 @@ class ArteTvIE(InfoExtractor):                  'url': video_url,                  'ext': 'flv',                  } + +    def _extract_liveweb(self, url, name, lang): +        """Extract form http://liveweb.arte.tv/""" +        webpage = self._download_webpage(url, name) +        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') +        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, +                                            video_id, u'Downloading information') +        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) +        event_doc = config_doc.find('event') +        url_node = event_doc.find('video').find('urlHd') +        if url_node is None: +            url_node = video_doc.find('urlSd') + +        return {'id': video_id, +                'title': event_doc.find('name%s' % lang.capitalize()).text, +                'url': url_node.text.replace('MP4', 'mp4'), +                'ext': 'flv', +                'thumbnail': self._og_search_thumbnail(webpage), +                } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e89..53a898de3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@  import re +import json  from .common import InfoExtractor +from ..utils import determine_ext  class BreakIE(InfoExtractor): @@ -17,17 +19,20 @@ class BreakIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group(1).split("-")[-1] -        webpage = self._download_webpage(url, video_id) -        video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) -        key = re.search(r"icon: '(.+?)',",webpage).group(1) -        final_url = str(video_url)+"?"+str(key) -        thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) -        title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) -        ext = video_url.split('.')[-1] +        embed_url = 'http://www.break.com/embed/%s' % video_id +        webpage = self._download_webpage(embed_url, video_id) +        info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, +                                       u'info json', flags=re.DOTALL) +        info = json.loads(info_json) +        video_url = info['videoUri'] +        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) +        if m_youtube is not None: +            return self.url_result(m_youtube.group(1), 'Youtube') +        final_url = video_url + '?' + info['AuthToken']          return [{              'id':        video_id,              'url':       final_url, -            'ext':       ext, -            'title':     title, -            'thumbnail': thumbnail_url, +            'ext':       determine_ext(final_url), +            'title':     info['contentName'], +            'thumbnail': info['thumbUri'],          }] diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972e5..8d4c93d6d 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,36 @@  import re -import socket  import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( -    compat_http_client, -    compat_str, -    compat_urllib_error,      compat_urllib_parse_urlparse, -    compat_urllib_request, +    determine_ext,      ExtractorError,  )  class CollegeHumorIE(InfoExtractor): -    _WORKING = False -    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$' +    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' -    def report_manifest(self, video_id): -        """Report information extraction.""" -        self.to_screen(u'%s: Downloading XML manifest' % video_id) +    _TESTS = [{ +        u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', +        u'file': u'6902724.mp4', +        u'md5': u'1264c12ad95dca142a9f0bf7968105a0', +        u'info_dict': { +            u'title': u'Comic-Con Cosplay Catastrophe', +            u'description': u'Fans get creative this year at San Diego.  Too creative.  And yes, that\'s really Joss Whedon.', +        }, +    }, +    { +        u'url': u'http://www.collegehumor.com/video/3505939/font-conference', +        u'file': u'3505939.mp4', +        u'md5': u'c51ca16b82bb456a4397987791a835f5', +        u'info_dict': { +            u'title': u'Font Conference', +            u'description': u'This video wasn\'t long enough, so we made it double-spaced.', +        }, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -36,39 +46,42 @@ class CollegeHumorIE(InfoExtractor):          self.report_extraction(video_id)          xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id -        try: -            metaXml = compat_urllib_request.urlopen(xmlUrl).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) +        metaXml = self._download_webpage(xmlUrl, video_id, +                                         u'Downloading info XML', +                                         u'Unable to download video info XML')          mdoc = xml.etree.ElementTree.fromstring(metaXml)          try:              videoNode = mdoc.findall('./video')[0] +            youtubeIdNode = videoNode.find('./youtubeID') +            if youtubeIdNode is not None: +                return self.url_result(youtubeIdNode.text, 'Youtube')              info['description'] = videoNode.findall('./description')[0].text              info['title'] = videoNode.findall('./caption')[0].text              info['thumbnail'] = videoNode.findall('./thumbnail')[0].text -            manifest_url = videoNode.findall('./file')[0].text +            next_url = videoNode.findall('./file')[0].text          except IndexError:              raise ExtractorError(u'Invalid metadata XML file') -        manifest_url += '?hdcore=2.10.3' -        self.report_manifest(video_id) -        try: -            manifestXml = compat_urllib_request.urlopen(manifest_url).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) - -        adoc = xml.etree.ElementTree.fromstring(manifestXml) -        try: -            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] -            node_id = media_node.attrib['url'] -            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text -        except IndexError as err: -            raise ExtractorError(u'Invalid manifest file') +        if next_url.endswith(u'manifest.f4m'): +            manifest_url = next_url + '?hdcore=2.10.3' +            manifestXml = self._download_webpage(manifest_url, video_id, +                                         u'Downloading XML manifest', +                                         u'Unable to download video info XML') -        url_pr = compat_urllib_parse_urlparse(manifest_url) -        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' +            adoc = xml.etree.ElementTree.fromstring(manifestXml) +            try: +                media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] +                node_id = media_node.attrib['url'] +                video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text +            except IndexError as err: +                raise ExtractorError(u'Invalid manifest file') +            url_pr = compat_urllib_parse_urlparse(info['thumbnail']) +            info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') +            info['ext'] = 'mp4' +        else: +            # Old-style direct links +            info['url'] = next_url +            info['ext'] = determine_ext(info['url']) -        info['url'] = url -        info['ext'] = 'f4f' -        return [info] +        return info diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d5e..bf8d711ee 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor):                           (full-episodes/(?P<episode>.*)|                            (?P<clip>                                (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) -                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))))) +                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))| +                          (?P<interview> +                              extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))                       $"""      _TEST = {          u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor):              else:                  epTitle = mobj.group('cntitle')              dlNewest = False +        elif mobj.group('interview'): +            epTitle = mobj.group('interview_title') +            dlNewest = False          else:              dlNewest = not mobj.group('episode')              if dlNewest: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e2e192bef..52c4483c9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -78,7 +78,13 @@ class InfoExtractor(object):      @classmethod      def suitable(cls, url):          """Receives a URL and returns True if suitable for this IE.""" -        return re.match(cls._VALID_URL, url) is not None + +        # This does not use has/getattr intentionally - we want to know whether +        # we have cached the regexp for *this* class, whereas getattr would also +        # match the superclass +        if '_VALID_URL_RE' not in cls.__dict__: +            cls._VALID_URL_RE = re.compile(cls._VALID_URL) +        return cls._VALID_URL_RE.match(url) is not None      @classmethod      def working(cls): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8fab16005..f54ecc569 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,8 +82,8 @@ class DailymotionIE(DailyMotionSubtitlesIE):          # TODO: support choosing qualities -        for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url', -                    'stream_h264_hq_url', 'stream_h264_url', +        for key in ['stream_h264_hd1080_url','stream_h264_hd_url', +                    'stream_h264_hq_url','stream_h264_url',                      'stream_h264_ld_url']:              if info.get(key):  # key in info and info[key]:                  max_quality = key @@ -116,3 +116,31 @@ class DailymotionIE(DailyMotionSubtitlesIE):              'subtitles':    video_subtitles,              'thumbnail': info['thumbnail_url']          }] + + +class DailymotionPlaylistIE(InfoExtractor): +    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' +    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        playlist_id =  mobj.group('id') +        video_ids = [] + +        for pagenum in itertools.count(1): +            webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), +                                             playlist_id, u'Downloading page %s' % pagenum) + +            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) +            video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + +            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: +                break + +        entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') +                   for video_id in video_ids] +        return {'_type': 'playlist', +                'id': playlist_id, +                'title': get_element_by_id(u'playlist_name', webpage), +                'entries': entries, +                } diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index fe1582d1a..3443f19c5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,17 +8,30 @@ class ExfmIE(InfoExtractor):      IE_NAME = u'exfm'      IE_DESC = u'ex.fm'      _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' -    _SOUNDCLOUD_URL_ = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' -    _TEST = { -        u'url': u'http://ex.fm/song/1bgtzg', -        u'file': u'1bgtzg.mp3', -        u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', -        u'info_dict': { -            u"title": u"We Can't Stop", -            u"uploader": u"Miley Cyrus", -            u'thumbnail': u'http://i1.sndcdn.com/artworks-000049666230-w9i7ef-t500x500.jpg?9d68d37' -        } -    } +    _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' +    _TESTS = [ +        { +            u'url': u'http://ex.fm/song/1bgtzg', +            u'file': u'95223130.mp3', +            u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', +            u'info_dict': { +                u"title": u"We Can't Stop - Miley Cyrus", +                u"uploader": u"Miley Cyrus", +                u'upload_date': u'20130603', +                u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', +            }, +            u'note': u'Soundcloud song', +        }, +        { +            u'url': u'http://ex.fm/song/wddt8', +            u'file': u'wddt8.mp3', +            u'md5': u'966bd70741ac5b8570d8e45bfaed3643', +            u'info_dict': { +                u'title': u'Safe and Sound', +                u'uploader': u'Capital Cities', +            }, +        }, +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -26,11 +39,10 @@ class ExfmIE(InfoExtractor):          info_url = "http://ex.fm/api/v3/song/%s" %(song_id)          webpage = self._download_webpage(info_url, song_id)          info = json.loads(webpage) -        song_url = re.match(self._SOUNDCLOUD_URL_,info['song']['url']) -        if song_url is not None: -        	song_url = song_url.group() + "?client_id=b45b1aa10f1ac2941910a7f0d10f8e28" -        else: -        	song_url = info['song']['url'] +        song_url = info['song']['url'] +        if re.match(self._SOUNDCLOUD_URL, song_url) is not None: +            self.to_screen('Soundcloud song detected') +            return self.url_result(song_url.replace('/stream',''), 'Soundcloud')          return [{              'id':          song_id,              'url':         song_url, diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 67a7e5f76..4508f0dfa 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,17 +21,14 @@ class FunnyOrDieIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', +        video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"',              webpage, u'video URL', flags=re.DOTALL) -        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", -            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL) -          info = {              'id': video_id,              'url': video_url,              'ext': 'mp4', -            'title': title, +            'title': self._og_search_title(webpage),              'description': self._og_search_description(webpage),          }          return [info] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b633e896c..da016f7ee 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,8 +107,13 @@ class GenericIE(InfoExtractor):          return new_url      def _real_extract(self, url): -        new_url = self._test_redirect(url) -        if new_url: return [self.url_result(new_url)] +        try: +            new_url = self._test_redirect(url) +            if new_url: +                return [self.url_result(new_url)] +        except compat_urllib_error.HTTPError: +            # This may be a stupid server that doesn't like HEAD, our UA, or so +            pass          video_id = url.split('/')[-1]          try: @@ -145,6 +150,9 @@ class GenericIE(InfoExtractor):              if m_video_type is not None:                  mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)          if mobj is None: +            # HTML5 video +            mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL) +        if mobj is None:              raise ExtractorError(u'Invalid URL: %s' % url)          # It's possible that one of the regexes diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 962c59214..652f19b7b 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor  class InaIE(InfoExtractor):      """Information Extractor for Ina.fr""" -    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' +    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*'      _TEST = {          u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',          u'file': u'I12055569.mp4', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py new file mode 100644 index 000000000..4327bc13d --- /dev/null +++ b/youtube_dl/extractor/jeuxvideo.py @@ -0,0 +1,47 @@ +# coding: utf-8 + +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + +class JeuxVideoIE(InfoExtractor): +    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + +    _TEST = { +        u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', +        u'file': u'5182.mp4', +        u'md5': u'e0fdb0cd3ce98713ef9c1e1e025779d0', +        u'info_dict': { +            u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité', +            u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        title = re.match(self._VALID_URL, url).group(1) +        webpage = self._download_webpage(url, title) +        m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage) + +        xml_link = m_download.group(1) +         +        id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + +        xml_config = self._download_webpage(xml_link, title, +                                                  'Downloading XML config') +        config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) +        info = re.search(r'<format\.json>(.*?)</format\.json>', +                         xml_config, re.MULTILINE|re.DOTALL).group(1) +        info = json.loads(info)['versions'][0] +         +        video_url = 'http://video720.jeuxvideo.com/' + info['file'] + +        return {'id': id, +                'title' : config.find('titre_video').text, +                'ext' : 'mp4', +                'url' : video_url, +                'description': self._og_search_description(webpage), +                'thumbnail': config.find('image').text, +                } diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py new file mode 100644 index 000000000..8537ba584 --- /dev/null +++ b/youtube_dl/extractor/kankan.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class KankanIE(InfoExtractor): +    _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' +     +    _TEST = { +        u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml', +        u'file': u'48863.flv', +        u'md5': u'29aca1e47ae68fc28804aca89f29507e', +        u'info_dict': { +            u'title': u'Ready To Go', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) + +        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') +        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + +        video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, +                                                 video_id, u'Downloading video url info') +        ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip') +        path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') +        video_url = 'http://%s%s' % (ip, path) + +        return {'id': video_id, +                'title': title, +                'url': video_url, +                'ext': determine_ext(video_url), +                } diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index dda78743d..a7b88d2d9 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor  class KeekIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)' +    _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'      IE_NAME = u'keek'      _TEST = { -        u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', +        u'url': u'https://www.keek.com/ytdl/keeks/NODfbab',          u'file': u'NODfbab.mp4',          u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83',          u'info_dict': { diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py new file mode 100644 index 000000000..03e31ea1c --- /dev/null +++ b/youtube_dl/extractor/muzu.py @@ -0,0 +1,64 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    determine_ext, +) + + +class MuzuTVIE(InfoExtractor): +    _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)' +    IE_NAME = u'muzu.tv' + +    _TEST = { +        u'url': u'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', +        u'file': u'1981454.mp4', +        u'md5': u'98f8b2c7bc50578d6a0364fff2bfb000', +        u'info_dict': { +            u'title': u'Cat Walk (Original Mix)', +            u'description': u'md5:90e868994de201b2570e4e5854e19420', +            u'uploader': u'MarcAshken featuring SOS', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        info_data = compat_urllib_parse.urlencode({'format': 'json', +                                                   'url': url, +                                                   }) +        video_info_page = self._download_webpage('http://www.muzu.tv/api/oembed/?%s' % info_data, +                                                 video_id, u'Downloading video info') +        info = json.loads(video_info_page) + +        player_info_page = self._download_webpage('http://player.muzu.tv/player/playerInit?ai=%s' % video_id, +                                                  video_id, u'Downloading player info') +        video_info = json.loads(player_info_page)['videos'][0] +        for quality in ['1080' , '720', '480', '360']: +            if video_info.get('v%s' % quality): +                break + +        data = compat_urllib_parse.urlencode({'ai': video_id, +                                              # Even if each time you watch a video the hash changes, +                                              # it seems to work for different videos, and it will work +                                              # even if you use any non empty string as a hash +                                              'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', +                                              'device': 'web', +                                              'qv': quality, +                                              }) +        video_url_page = self._download_webpage('http://player.muzu.tv/player/requestVideo?%s' % data, +                                                video_id, u'Downloading video url') +        video_url_info = json.loads(video_url_page) +        video_url = video_url_info['url'] + +        return {'id': video_id, +                'title': info['title'], +                'url': video_url, +                'ext': determine_ext(video_url), +                'thumbnail': info['thumbnail_url'], +                'description': info['description'], +                'uploader': info['author_name'], +                } diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index b2a7b1df0..0404e6e43 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -2,11 +2,13 @@ import binascii  import base64  import hashlib  import re +import json  from .common import InfoExtractor  from ..utils import (      compat_ord,      compat_urllib_parse, +    compat_urllib_request,      ExtractorError,  ) @@ -16,7 +18,7 @@ from ..utils import (  class MyVideoIE(InfoExtractor):      """Information Extractor for myvideo.de.""" -    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' +    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*'      IE_NAME = u'myvideo'      _TEST = {          u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -85,6 +87,20 @@ class MyVideoIE(InfoExtractor):                  'ext':      video_ext,              }] +        mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) +        if mobj is not None: +            request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') +            response = self._download_webpage(request, video_id, +                                              u'Downloading video info') +            info = json.loads(base64.b64decode(response).decode('utf-8')) +            return {'id': video_id, +                    'title': info['title'], +                    'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), +                    'play_path': info['filename'], +                    'ext': 'flv', +                    'thumbnail': info['thumbnail'][0]['url'], +                    } +          # try encxml          mobj = re.search('var flashvars={(.+?)}', webpage)          if mobj is None: diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py new file mode 100644 index 000000000..b734722d0 --- /dev/null +++ b/youtube_dl/extractor/ooyala.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unescapeHTML + +class OoyalaIE(InfoExtractor): +    _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P<id>.+?)(&|$)' + +    _TEST = { +        # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video +        u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', +        u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4', +        u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c', +        u'info_dict': { +            u'title': u'Explaining Data Recovery from Hard Drives and SSDs', +            u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', +        }, +    } + +    def _extract_result(self, info, more_info): +        return {'id': info['embedCode'], +                'ext': 'mp4', +                'title': unescapeHTML(info['title']), +                'url': info['url'], +                'description': unescapeHTML(more_info['description']), +                'thumbnail': more_info['promo'], +                } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        embedCode = mobj.group('id') +        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode +        player = self._download_webpage(player_url, embedCode) +        mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', +                                        player, u'mobile player url') +        mobile_player = self._download_webpage(mobile_url, embedCode) +        videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') +        videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') +        videos_info = json.loads(videos_info) +        videos_more_info =json.loads(videos_more_info) + +        if videos_more_info.get('lineup'): +            videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] +            return {'_type': 'playlist', +                    'id': embedCode, +                    'title': unescapeHTML(videos_more_info['title']), +                    'entries': videos, +                    } +        else: +            return self._extract_result(videos_info[0], videos_more_info) +         diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py new file mode 100644 index 000000000..65462d867 --- /dev/null +++ b/youtube_dl/extractor/pbs.py @@ -0,0 +1,34 @@ +import re +import json + +from .common import InfoExtractor + + +class PBSIE(InfoExtractor): +    _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?' + +    _TEST = { +        u'url': u'http://video.pbs.org/video/2365006249/', +        u'file': u'2365006249.mp4', +        u'md5': 'ce1888486f0908d555a8093cac9a7362', +        u'info_dict': { +            u'title': u'A More Perfect Union', +            u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a', +            u'duration': 3190, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id +        info_page = self._download_webpage(info_url, video_id) +        info =json.loads(info_page) +        return {'id': video_id, +                'title': info['title'], +                'url': info['alternate_encoding']['url'], +                'ext': 'mp4', +                'description': info['program'].get('description'), +                'thumbnail': info.get('image_url'), +                'duration': info.get('duration'), +                } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 000000000..d339e6cb5 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,49 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): +    _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + +    _TEST = { +        u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', +        u'file': u'passionpittakeawalklive.flv', +        u'md5': u'd9dea8360a1e7d485d2206db7fe13035', +        u'info_dict': { +            u'title': u'Take A Walk (live)', +            u'uploader': u'Passion Pit', +            u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', +        }, +        u'skip': u'Requires rtmpdump', +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        filename = mobj.group('filename') +        info_url = 'http://www.roxwel.com/api/videos/%s' % filename +        info_page = self._download_webpage(info_url, filename, +                                           u'Downloading video info') + +        self.report_extraction(filename) +        info = json.loads(info_page) +        rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) +        best_rate = rtmp_rates[-1] +        url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) +        rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') +        ext = determine_ext(rtmp_url) +        if ext == 'f4v': +            rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + +        return {'id': filename, +                'title': info['title'], +                'url': rtmp_url, +                'ext': 'flv', +                'description': info['description'], +                'thumbnail': info.get('player_image_url') or info.get('image_url_large'), +                'uploader': info['artist'], +                'uploader_id': info['artistname'], +                'upload_date': unified_strdate(info['dbdate']), +                } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py new file mode 100644 index 000000000..2f134e6a7 --- /dev/null +++ b/youtube_dl/extractor/rtlnow.py @@ -0,0 +1,113 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    ExtractorError, +) + +class RTLnowIE(InfoExtractor): +    """Information Extractor for RTLnow, RTL2now and VOXnow""" +    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' +    _TESTS = [{ +        u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', +        u'file': u'90419.flv', +        u'info_dict': { +            u'upload_date': u'20070416',  +            u'title': u'Ahornallee - Folge 1 - Der Einzug', +            u'description': u'Folge 1 - Der Einzug', +        }, +        u'params': { +            u'skip_download': True, +        }, +        u'skip': u'Only works from Germany', +    }, +    { +        u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', +        u'file': u'69756.flv', +        u'info_dict': { +            u'upload_date': u'20120519',  +            u'title': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', +            u'description': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', +            u'thumbnail': u'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', +        }, +        u'params': { +            u'skip_download': True, +        }, +        u'skip': u'Only works from Germany', +    }, +    { +        u'url': u'www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', +        u'file': u'13883.flv', +        u'info_dict': { +            u'upload_date': u'20090627',  +            u'title': u'Voxtours - Südafrika-Reporter II', +            u'description': u'Südafrika-Reporter II', +        }, +        u'params': { +            u'skip_download': True, +        }, +    }] + +    def _real_extract(self,url): +        mobj = re.match(self._VALID_URL, url) + +        webpage_url = u'http://' + mobj.group('url') +        video_page_url = u'http://' + mobj.group('base_url') +        video_id = mobj.group(u'video_id') + +        webpage = self._download_webpage(webpage_url, video_id) + +        note_m = re.search(r'''(?sx) +            <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?) +            <div[ ]id="playerteaser">''', webpage) +        if note_m: +            msg = clean_html(note_m.group(1)) +            raise ExtractorError(msg) + +        video_title = self._html_search_regex(r'<title>(?P<title>[^<]+)</title>', +            webpage, u'title') +        playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'', +            webpage, u'playerdata_url') + +        playerdata = self._download_webpage(playerdata_url, video_id) +        mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]></title>', playerdata) +        if mobj: +            video_description = mobj.group(u'description') +            if mobj.group('upload_date_Y'): +                video_upload_date = mobj.group('upload_date_Y') +            else: +                video_upload_date = u'20' + mobj.group('upload_date_y') +            video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') +        else: +            video_description = None +            video_upload_date = None +            self._downloader.report_warning(u'Unable to extract description and upload date') + +        # Thumbnail: not every video has an thumbnail +        mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage) +        if mobj: +            video_thumbnail = mobj.group(u'thumbnail') +        else: +            video_thumbnail = None + +        mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata) +        if mobj is None: +            raise ExtractorError(u'Unable to extract media URL') +        video_url = mobj.group(u'url') +        video_play_path = u'mp4:' + mobj.group(u'play_path') +        video_player_url = video_page_url + u'includes/vodplayer.swf' + +        return [{ +            'id':          video_id, +            'url':         video_url, +            'play_path':   video_play_path, +            'page_url':    video_page_url, +            'player_url':  video_player_url, +            'ext':         'flv', +            'title':       video_title, +            'description': video_description, +            'upload_date': video_upload_date, +            'thumbnail':   video_thumbnail, +        }] diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py new file mode 100644 index 000000000..2cba53076 --- /dev/null +++ b/youtube_dl/extractor/slashdot.py @@ -0,0 +1,23 @@ +import re + +from .common import InfoExtractor + + +class SlashdotIE(InfoExtractor): +    _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)' + +    _TEST = { +        u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz', +        u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4', +        u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', +        u'info_dict': { +            u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) +        ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url') +        return self.url_result(ooyala_url, 'Ooyala') diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d47c49c03..5f3a5540d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -4,6 +4,7 @@ import re  from .common import InfoExtractor  from ..utils import (      compat_str, +    compat_urlparse,      ExtractorError,      unified_strdate, @@ -19,7 +20,12 @@ class SoundcloudIE(InfoExtractor):         of the stream token and uid       """ -    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$' +    _VALID_URL = r'''^(?:https?://)? +                    (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) +                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) +                       |(?P<widget>w.soundcloud.com/player/?.*?url=.*) +                    ) +                    '''      IE_NAME = u'soundcloud'      _TEST = {          u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', @@ -33,59 +39,68 @@ class SoundcloudIE(InfoExtractor):          }      } +    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + +    @classmethod +    def suitable(cls, url): +        return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None +      def report_resolve(self, video_id):          """Report information extraction."""          self.to_screen(u'%s: Resolving id' % video_id) -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) - -        # extract uploader (which is in the url) -        uploader = mobj.group(1) -        # extract simple title (uploader + slug of song title) -        slug_title =  mobj.group(2) -        full_title = '%s/%s' % (uploader, slug_title) - -        self.report_resolve(full_title) - -        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) -        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' -        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') +    @classmethod +    def _resolv_url(cls, url): +        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID -        info = json.loads(info_json) +    def _extract_info_dict(self, info, full_title=None):          video_id = info['id'] -        self.report_extraction(full_title) - -        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' -        stream_json = self._download_webpage(streams_url, full_title, -                                             u'Downloading stream definitions', -                                             u'unable to download stream definitions') - -        streams = json.loads(stream_json) -        mediaURL = streams['http_mp3_128_url'] -        upload_date = unified_strdate(info['created_at']) +        name = full_title or video_id +        self.report_extraction(name) -        return [{ +        thumbnail = info['artwork_url'] +        if thumbnail is not None: +            thumbnail = thumbnail.replace('-large', '-t500x500') +        return {              'id':       info['id'], -            'url':      mediaURL, +            'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,              'uploader': info['user']['username'], -            'upload_date': upload_date, +            'upload_date': unified_strdate(info['created_at']),              'title':    info['title'],              'ext':      u'mp3',              'description': info['description'], -        }] +            'thumbnail': thumbnail, +        } -class SoundcloudSetIE(InfoExtractor): -    """Information extractor for soundcloud.com sets -       To access the media, the uid of the song and a stream token -       must be extracted from the page source and the script must make -       a request to media.soundcloud.com/crossdomain.xml. Then -       the media can be grabbed by requesting from an url composed -       of the stream token and uid -     """ +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) +        if mobj is None: +            raise ExtractorError(u'Invalid URL: %s' % url) + +        track_id = mobj.group('track_id') +        if track_id is not None: +            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID +            full_title = track_id +        elif mobj.group('widget'): +            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +            return self.url_result(query['url'][0], ie='Soundcloud') +        else: +            # extract uploader (which is in the url) +            uploader = mobj.group(1) +            # extract simple title (uploader + slug of song title) +            slug_title =  mobj.group(2) +            full_title = '%s/%s' % (uploader, slug_title) +     +            self.report_resolve(full_title) +     +            url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) +            info_json_url = self._resolv_url(url) +        info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON') +        info = json.loads(info_json) +        return self._extract_info_dict(info, full_title) + +class SoundcloudSetIE(SoundcloudIE):      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'      IE_NAME = u'soundcloud:set'      _TEST = { @@ -153,10 +168,6 @@ class SoundcloudSetIE(InfoExtractor):          ]      } -    def report_resolve(self, video_id): -        """Report information extraction.""" -        self.to_screen(u'%s: Resolving id' % video_id) -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          if mobj is None: @@ -171,7 +182,7 @@ class SoundcloudSetIE(InfoExtractor):          self.report_resolve(full_title)          url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) -        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' +        resolv_url = self._resolv_url(url)          info_json = self._download_webpage(resolv_url, full_title)          videos = [] @@ -182,23 +193,8 @@ class SoundcloudSetIE(InfoExtractor):              return          self.report_extraction(full_title) -        for track in info['tracks']: -            video_id = track['id'] - -            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' -            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') - -            self.report_extraction(video_id) -            streams = json.loads(stream_json) -            mediaURL = streams['http_mp3_128_url'] - -            videos.append({ -                'id':       video_id, -                'url':      mediaURL, -                'uploader': track['user']['username'], -                'upload_date':  unified_strdate(track['created_at']), -                'title':    track['title'], -                'ext':      u'mp3', -                'description': track['description'], -            }) -        return videos +        return {'_type': 'playlist', +                'entries': [self._extract_info_dict(track) for track in info['tracks']], +                'id': info['id'], +                'title': info['title'], +                } diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index b8e6b3bf9..1ea4a9f2f 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -5,13 +5,13 @@ from .common import InfoExtractor  class StatigramIE(InfoExtractor):      _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'      _TEST = { -        u'url': u'http://statigr.am/p/484091715184808010_284179915', -        u'file': u'484091715184808010_284179915.mp4', -        u'md5': u'deda4ff333abe2e118740321e992605b', +        u'url': u'http://statigr.am/p/522207370455279102_24101272', +        u'file': u'522207370455279102_24101272.mp4', +        u'md5': u'6eb93b882a3ded7c378ee1d6884b1814',          u'info_dict': { -            u"uploader_id": u"videoseconds",  -            u"title": u"Instagram photo by @videoseconds" -        } +            u'uploader_id': u'aguynamedpatrick', +            u'title': u'Instagram photo by @aguynamedpatrick (Patrick Janelle)', +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index ec92e589a..c910110ca 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -33,7 +33,7 @@ class TeamcocoIE(InfoExtractor):          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id          data = self._download_webpage(data_url, video_id, 'Downloading data webpage') -        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>', +        video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>',              data, u'video URL')          return [{ diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e0ffeced5..772134a12 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,19 +6,17 @@ import re  from .common import InfoExtractor  class TF1IE(InfoExtractor): -    """ -    TF1 uses the wat.tv player, currently it can only download videos with the -    html5 player enabled, it cannot download HD videos. -    """ +    """TF1 uses the wat.tv player."""      _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'      _TEST = {          u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',          u'file': u'10635995.mp4', -        u'md5': u'66789d3e91278d332f75e1feb7aea327', +        u'md5': u'2e378cc28b9957607d5e88f274e637d8',          u'info_dict': {              u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',              u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', -        } +        }, +        u'skip': u'Sometimes wat serves the whole file with the --test option',      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 324bb6231..35f89e9ee 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -4,11 +4,11 @@ from .common import InfoExtractor  class TrailerAddictIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)' +    _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'      _TEST = {          u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer',          u'file': u'76184.mp4', -        u'md5': u'41365557f3c8c397d091da510e73ceb4', +        u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71',          u'info_dict': {              u"title": u"Prince Avalanche Trailer",              u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind." @@ -17,24 +17,30 @@ class TrailerAddictIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) -        webpage = self._download_webpage(url, video_id) -         +        name = mobj.group('movie') + '/' + mobj.group('trailer_name') +        webpage = self._download_webpage(url, name) +          title = self._search_regex(r'<title>(.+?)</title>',                  webpage, 'video title').replace(' - Trailer Addict','')          view_count = self._search_regex(r'Views: (.+?)<br />',                  webpage, 'Views Count')          video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] -        info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) +        # Presence of (no)watchplus function indicates HD quality is available +        if re.search(r'function (no)?watchplus()', webpage): +            fvar = "fvarhd" +        else: +            fvar = "fvar" + +        info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id))          info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") -         +          final_url = self._search_regex(r'&fileurl=(.+)',                  info_webpage, 'Download url').replace('%3F','?')          thumbnail_url = self._search_regex(r'&image=(.+?)&',                  info_webpage, 'thumbnail url')          ext = final_url.split('.')[-1].split('?')[0] -         +          return [{              'id'          : video_id,              'url'         : final_url, diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py new file mode 100644 index 000000000..5ba0a9061 --- /dev/null +++ b/youtube_dl/extractor/unistra.py @@ -0,0 +1,32 @@ +import re + +from .common import InfoExtractor + +class UnistraIE(InfoExtractor): +    _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)' + +    _TEST = { +        u'url': u'http://utv.unistra.fr/video.php?id_video=154', +        u'file': u'154.mp4', +        u'md5': u'736f605cfdc96724d55bb543ab3ced24', +        u'info_dict': { +            u'title': u'M!ss Yella', +            u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc', +        }, +    } + +    def _real_extract(self, url): +        id = re.match(self._VALID_URL, url).group(1) +        webpage = self._download_webpage(url, id) +        file = re.search(r'file: "(.*?)",', webpage).group(1) +        title = self._html_search_regex(r'<title>UTV - (.*?)</', webpage, u'title') + +        video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file + +        return {'id': id, +                'title': title, +                'ext': 'mp4', +                'url': video_url, +                'description': self._html_search_regex(r'<meta name="Description" content="(.*?)"', webpage, u'description', flags=re.DOTALL), +                'thumbnail': self._search_regex(r'image: "(.*?)"', webpage, u'thumbnail'), +                } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3b16dcfbc..70408c4f0 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -8,18 +8,18 @@ from ..utils import (  class VevoIE(InfoExtractor):      """ -    Accecps urls from vevo.com or in the format 'vevo:{id}' +    Accepts urls from vevo.com or in the format 'vevo:{id}'      (currently used by MTVIE)      """ -    _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*)$' +    _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'      _TEST = {          u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',          u'file': u'GB1101300280.mp4',          u'md5': u'06bea460acb744eab74a9d7dcb4bfd61',          u'info_dict': { -            u"upload_date": u"20130624",  -            u"uploader": u"Hurts",  -            u"title": u"Somebody To Die For" +            u"upload_date": u"20130624", +            u"uploader": u"Hurts", +            u"title": u"Somebody to Die For"          }      } @@ -35,12 +35,12 @@ class VevoIE(InfoExtractor):          self.report_extraction(video_id)          video_info = json.loads(info_json) -        m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):(?P<url>.*?)"', links_webpage)) +        m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage))          if m_urls is None or len(m_urls) == 0:              raise ExtractorError(u'Unable to extract video url')          # They are sorted from worst to best quality          m_url = m_urls[-1] -        video_url = base_url + m_url.group('url') +        video_url = base_url + '/' + m_url.group('url')          ext = m_url.group('ext')          return {'url': video_url, diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py new file mode 100644 index 000000000..04106672b --- /dev/null +++ b/youtube_dl/extractor/videofyme.py @@ -0,0 +1,49 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( +    find_xpath_attr, +    determine_ext, +) + +class VideofyMeIE(InfoExtractor): +    _VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)' +    IE_NAME = u'videofy.me' + +    _TEST = { +        u'url': u'http://www.videofy.me/thisisvideofyme/1100701', +        u'file':  u'1100701.mp4', +        u'md5': u'2046dd5758541d630bfa93e741e2fd79', +        u'info_dict': { +            u'title': u'This is VideofyMe', +            u'description': None, +            u'uploader': u'VideofyMe', +            u'uploader_id': u'thisisvideofyme', +        }, +         +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id, +                                            video_id) +        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) +        video = config.find('video') +        sources = video.find('sources') +        url_node = find_xpath_attr(sources, 'source', 'id', 'HQ on') +        if url_node is None: +            url_node = find_xpath_attr(sources, 'source', 'id', 'HQ off') +        video_url = url_node.find('url').text + +        return {'id': video_id, +                'title': video.find('title').text, +                'url': video_url, +                'ext': determine_ext(video_url), +                'thumbnail': video.find('thumb').text, +                'description': video.find('description').text, +                'uploader': config.find('blog/name').text, +                'uploader_id': video.find('identifier').text, +                'view_count': re.search(r'\d+', video.find('views').text).group(), +                } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ac32043c1..512e06e2a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,5 +1,6 @@  import json  import re +import itertools  from .common import InfoExtractor  from ..utils import ( @@ -19,18 +20,31 @@ class VimeoIE(InfoExtractor):      _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'      _NETRC_MACHINE = 'vimeo'      IE_NAME = u'vimeo' -    _TEST = { -        u'url': u'http://vimeo.com/56015672', -        u'file': u'56015672.mp4', -        u'md5': u'8879b6cc097e987f02484baf890129e5', -        u'info_dict': { -            u"upload_date": u"20121220",  -            u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",  -            u"uploader_id": u"user7108434",  -            u"uploader": u"Filippo Valsorda",  -            u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550" -        } -    } +    _TESTS = [ +        { +            u'url': u'http://vimeo.com/56015672', +            u'file': u'56015672.mp4', +            u'md5': u'8879b6cc097e987f02484baf890129e5', +            u'info_dict': { +                u"upload_date": u"20121220",  +                u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",  +                u"uploader_id": u"user7108434",  +                u"uploader": u"Filippo Valsorda",  +                u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", +            }, +        }, +        { +            u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', +            u'file': u'68093876.mp4', +            u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82', +            u'note': u'Vimeo Pro video (#1197)', +            u'info_dict': { +                u'uploader_id': u'openstreetmapus',  +                u'uploader': u'OpenStreetMap US',  +                u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', +            }, +        }, +    ]      def _login(self):          (username, password) = self._get_login_info() @@ -82,7 +96,9 @@ class VimeoIE(InfoExtractor):          video_id = mobj.group('id')          if not mobj.group('proto'):              url = 'https://' + url -        if mobj.group('direct_link') or mobj.group('pro'): +        elif mobj.group('pro'): +            url = 'http://player.vimeo.com/video/' + video_id +        elif mobj.group('direct_link'):              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information @@ -171,3 +187,31 @@ class VimeoIE(InfoExtractor):              'thumbnail':    video_thumbnail,              'description':  video_description,          }] + + +class VimeoChannelIE(InfoExtractor): +    IE_NAME = u'vimeo:channel' +    _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)' +    _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        channel_id =  mobj.group('id') +        video_ids = [] + +        for pagenum in itertools.count(1): +            webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum), +                                             channel_id, u'Downloading page %s' % pagenum) +            video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) +            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: +                break + +        entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') +                   for video_id in video_ids] +        channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id, +                                                webpage, u'channel title') +        return {'_type': 'playlist', +                'id': channel_id, +                'title': channel_title, +                'entries': entries, +                } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 0d1302cd2..7d228edac 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -17,11 +17,12 @@ class WatIE(InfoExtractor):      _TEST = {          u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',          u'file': u'10631273.mp4', -        u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a', +        u'md5': u'd8b2231e1e333acd12aad94b80937e19',          u'info_dict': {              u'title': u'World War Z - Philadelphia VOST',              u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', -        } +        }, +        u'skip': u'Sometimes wat serves the whole file with the --test option',      }      def download_video_info(self, real_id): @@ -58,20 +59,8 @@ class WatIE(InfoExtractor):          # Otherwise we can continue and extract just one part, we have to use          # the short id for getting the video url -        player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id, -                                                     'html5': '1'}) -        player_info = self._download_webpage('http://www.wat.tv/player?' + player_data, -                                             real_id, u'Downloading player info') -        player = json.loads(player_info)['player'] -        html5_player = self._html_search_regex(r'iframe src="(.*?)"', player, -                                               'html5 player') -        player_webpage = self._download_webpage(html5_player, real_id, -                                                u'Downloading player webpage') - -        video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage, -                                       'video url')          info = {'id': real_id, -                'url': video_url, +                'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,                  'ext': 'mp4',                  'title': first_chapter['title'],                  'thumbnail': first_chapter['preview'], diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 5b9779c05..3237596a3 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -21,6 +21,13 @@ class WorldStarHipHopIE(InfoExtractor):          webpage_src = self._download_webpage(url, video_id) +        m_vevo_id = re.search(r'videoId=(.*?)&?', +            webpage_src) +         +        if m_vevo_id is not None: +            self.to_screen(u'Vevo video detected:') +            return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') +          video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',              webpage_src, u'video URL') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 414e33b49..f6ffb86c3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -105,14 +105,27 @@ class YoutubeIE(YoutubeSubtitlesIE):                       ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID                       (?(1).+)?                                                # if we found the ID, everything can follow                       $""" -    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' -    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' -    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' -    _NETRC_MACHINE = 'youtube'      # Listed in order of quality -    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] -    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] +    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', +                          '95', '94', '93', '92', '132', '151', +                          # 3D +                          '85', '84', '102', '83', '101', '82', '100', +                          # Dash video +                          '138', '137', '248', '136', '247', '135', '246', +                          '245', '244', '134', '243', '133', '242', '160', +                          # Dash audio +                          '141', '172', '140', '171', '139', +                          ] +    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', +                                      '95', '94', '93', '92', '132', '151', +                                      '85', '102', '84', '101', '83', '100', '82', +                                      # Dash video +                                      '138', '248', '137', '247', '136', '246', '245', +                                      '244', '135', '243', '134', '242', '133', '160', +                                      # Dash audio +                                      '172', '141', '171', '140', '139', +                                      ]      _video_extensions = {          '13': '3gp',          '17': 'mp4', @@ -124,6 +137,47 @@ class YoutubeIE(YoutubeSubtitlesIE):          '44': 'webm',          '45': 'webm',          '46': 'webm', + +        # 3d videos +        '82': 'mp4', +        '83': 'mp4', +        '84': 'mp4', +        '85': 'mp4', +        '100': 'webm', +        '101': 'webm', +        '102': 'webm', + +        # videos that use m3u8 +        '92': 'mp4', +        '93': 'mp4', +        '94': 'mp4', +        '95': 'mp4', +        '96': 'mp4', +        '132': 'mp4', +        '151': 'mp4', + +        # Dash mp4 +        '133': 'mp4', +        '134': 'mp4', +        '135': 'mp4', +        '136': 'mp4', +        '137': 'mp4', +        '138': 'mp4', +        '139': 'mp4', +        '140': 'mp4', +        '141': 'mp4', +        '160': 'mp4', + +        # Dash webm +        '171': 'webm', +        '172': 'webm', +        '242': 'webm', +        '243': 'webm', +        '244': 'webm', +        '245': 'webm', +        '246': 'webm', +        '247': 'webm', +        '248': 'webm',      }      _video_dimensions = {          '5': '240x400', @@ -140,7 +194,69 @@ class YoutubeIE(YoutubeSubtitlesIE):          '44': '480x854',          '45': '720x1280',          '46': '1080x1920', +        '82': '360p', +        '83': '480p', +        '84': '720p', +        '85': '1080p', +        '92': '240p', +        '93': '360p', +        '94': '480p', +        '95': '720p', +        '96': '1080p', +        '100': '360p', +        '101': '480p', +        '102': '720p', +        '132': '240p', +        '151': '72p', +        '133': '240p', +        '134': '360p', +        '135': '480p', +        '136': '720p', +        '137': '1080p', +        '138': '>1080p', +        '139': '48k', +        '140': '128k', +        '141': '256k', +        '160': '192p', +        '171': '128k', +        '172': '256k', +        '242': '240p', +        '243': '360p', +        '244': '480p', +        '245': '480p', +        '246': '480p', +        '247': '720p', +        '248': '1080p', +    } +    _special_itags = { +        '82': '3D', +        '83': '3D', +        '84': '3D', +        '85': '3D', +        '100': '3D', +        '101': '3D', +        '102': '3D', +        '133': 'DASH Video', +        '134': 'DASH Video', +        '135': 'DASH Video', +        '136': 'DASH Video', +        '137': 'DASH Video', +        '138': 'DASH Video', +        '139': 'DASH Audio', +        '140': 'DASH Audio', +        '141': 'DASH Audio', +        '160': 'DASH Video', +        '171': 'DASH Audio', +        '172': 'DASH Audio', +        '242': 'DASH Video', +        '243': 'DASH Video', +        '244': 'DASH Video', +        '245': 'DASH Video', +        '246': 'DASH Video', +        '247': 'DASH Video', +        '248': 'DASH Video',      } +      IE_NAME = u'youtube'      _TESTS = [          { @@ -174,7 +290,7 @@ class YoutubeIE(YoutubeSubtitlesIE):                  u"upload_date": u"20120506",                  u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",                  u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", -                u"uploader": u"IconaPop", +                u"uploader": u"Icona Pop",                  u"uploader_id": u"IconaPop"              }          }, @@ -190,6 +306,21 @@ class YoutubeIE(YoutubeSubtitlesIE):                  u"uploader_id": u"justintimberlakeVEVO"              }          }, +        { +            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE', +            u'file': u'TGi3HqYrWHE.mp4', +            u'note': u'm3u8 video', +            u'info_dict': { +                u'title': u'Triathlon - Men - London 2012 Olympic Games', +                u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games', +                u'uploader': u'olympic', +                u'upload_date': u'20120807', +                u'uploader_id': u'olympic', +            }, +            u'params': { +                u'skip_download': True, +            }, +        },      ] @@ -199,10 +330,6 @@ class YoutubeIE(YoutubeSubtitlesIE):          if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None -    def report_lang(self): -        """Report attempt to set language.""" -        self.to_screen(u'Setting language') -      def report_video_webpage_download(self, video_id):          """Report attempt to download video webpage."""          self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -230,115 +357,144 @@ class YoutubeIE(YoutubeSubtitlesIE):              return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]          elif len(s) == 90:              return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] +        elif len(s) == 89: +            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]          elif len(s) == 88:              return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]          elif len(s) == 87: -            return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1] +            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]          elif len(s) == 86: -            return s[2:63] + s[82] + s[64:82] + s[63] +            return s[5:20] + s[2] + s[21:]          elif len(s) == 85: -            return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21] +            return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]          elif len(s) == 84: -            return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26] +            return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27]          elif len(s) == 83: -            return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:] +            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]          elif len(s) == 82:              return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]          elif len(s) == 81: -            return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81] +            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] +        elif len(s) == 79: +            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _print_formats(self, formats): -        print('Available formats:') -        for x in formats: -            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) +    def _decrypt_signature_age_gate(self, s): +        # The videos with age protection use another player, so the algorithms +        # can be different. +        if len(s) == 86: +            return s[2:63] + s[82] + s[64:82] + s[63] +        else: +            # Fallback to the other algortihms +            return self._decrypt_signature(s) -    def _real_initialize(self): -        if self._downloader is None: -            return -        # Set language -        request = compat_urllib_request.Request(self._LANG_URL) +    def _get_available_subtitles(self, video_id): +        self.report_video_subtitles_download(video_id) +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)          try: -            self.report_lang() -            compat_urllib_request.urlopen(request).read() +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) -            return - -        (username, password) = self._get_login_info() - -        # No authentication to be performed -        if username is None: -            return +            return (u'unable to download video subtitles: %s' % compat_str(err), None) +        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) +        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) +        if not sub_lang_list: +            return (u'video doesn\'t have subtitles', None) +        return sub_lang_list -        request = compat_urllib_request.Request(self._LOGIN_URL) +    def _list_available_subtitles(self, video_id): +        sub_lang_list = self._get_available_subtitles(video_id) +        self.report_video_subtitles_available(video_id, sub_lang_list) + +    def _request_subtitle(self, sub_lang, sub_name, video_id, format): +        """ +        Return tuple: +        (error_message, sub_lang, sub) +        """ +        self.report_video_subtitles_request(video_id, sub_lang, format) +        params = compat_urllib_parse.urlencode({ +            'lang': sub_lang, +            'name': sub_name, +            'v': video_id, +            'fmt': format, +        }) +        url = 'http://www.youtube.com/api/timedtext?' + params          try: -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') +            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) -            return +            return (u'unable to download video subtitles: %s' % compat_str(err), None, None) +        if not sub: +            return (u'Did not fetch video subtitles', None, None) +        return (None, sub_lang, sub) -        galx = None -        dsh = None -        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page) -        if match: -          galx = match.group(1) - -        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page) -        if match: -          dsh = match.group(1) - -        # Log in -        login_form_strs = { -                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', -                u'Email': username, -                u'GALX': galx, -                u'Passwd': password, -                u'PersistentCookie': u'yes', -                u'_utf8': u'霱', -                u'bgresponse': u'js_disabled', -                u'checkConnection': u'', -                u'checkedDomains': u'youtube', -                u'dnConn': u'', -                u'dsh': dsh, -                u'pstMsg': u'0', -                u'rmShown': u'1', -                u'secTok': u'', -                u'signIn': u'Sign in', -                u'timeStmp': u'', -                u'service': u'youtube', -                u'uilel': u'3', -                u'hl': u'en_US', -        } -        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode -        # chokes on unicode -        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) -        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data) +    def _request_automatic_caption(self, video_id, webpage): +        """We need the webpage for getting the captions url, pass it as an +           argument to speed up the process.""" +        sub_lang = self._downloader.params.get('subtitleslang') or 'en' +        sub_format = self._downloader.params.get('subtitlesformat') +        self.to_screen(u'%s: Looking for automatic captions' % video_id) +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang +        if mobj is None: +            return [(err_msg, None, None)] +        player_config = json.loads(mobj.group(1))          try: -            self.report_login() -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: -                self._downloader.report_warning(u'unable to log in: bad username or password') -                return -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) -            return +            args = player_config[u'args'] +            caption_url = args[u'ttsurl'] +            timestamp = args[u'timestamp'] +            params = compat_urllib_parse.urlencode({ +                'lang': 'en', +                'tlang': sub_lang, +                'fmt': sub_format, +                'ts': timestamp, +                'kind': 'asr', +            }) +            subtitles_url = caption_url + '&' + params +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') +            return [(None, sub_lang, sub)] +        except KeyError: +            return [(err_msg, None, None)] + +    def _extract_subtitle(self, video_id): +        """ +        Return a list with a tuple: +        [(error_message, sub_lang, sub)] +        """ +        sub_lang_list = self._get_available_subtitles(video_id) +        sub_format = self._downloader.params.get('subtitlesformat') +        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles +            return [(sub_lang_list[0], None, None)] +        if self._downloader.params.get('subtitleslang', False): +            sub_lang = self._downloader.params.get('subtitleslang') +        elif 'en' in sub_lang_list: +            sub_lang = 'en' +        else: +            sub_lang = list(sub_lang_list.keys())[0] +        if not sub_lang in sub_lang_list: +            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] -        # Confirm age -        age_form = { -                'next_url':     '/', -                'action_confirm':   'Confirm', -                } -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) -        try: -            self.report_age_confirmation() -            compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) +        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) +        return [subtitle] + +    def _extract_all_subtitles(self, video_id): +        sub_lang_list = self._get_available_subtitles(video_id) +        sub_format = self._downloader.params.get('subtitlesformat') +        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles +            return [(sub_lang_list[0], None, None)] +        subtitles = [] +        for sub_lang in sub_lang_list: +            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) +            subtitles.append(subtitle) +        return subtitles + +    def _print_formats(self, formats): +        print('Available formats:') +        for x in formats: +            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), +                                        self._video_dimensions.get(x, '???'), +                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))      def _extract_id(self, url):          mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -347,6 +503,57 @@ class YoutubeIE(YoutubeSubtitlesIE):          video_id = mobj.group(2)          return video_id +    def _get_video_url_list(self, url_map): +        """ +        Transform a dictionary in the format {itag:url} to a list of (itag, url) +        with the requested formats. +        """ +        req_format = self._downloader.params.get('format', None) +        format_limit = self._downloader.params.get('format_limit', None) +        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats +        if format_limit is not None and format_limit in available_formats: +            format_list = available_formats[available_formats.index(format_limit):] +        else: +            format_list = available_formats +        existing_formats = [x for x in format_list if x in url_map] +        if len(existing_formats) == 0: +            raise ExtractorError(u'no known formats available for video') +        if self._downloader.params.get('listformats', None): +            self._print_formats(existing_formats) +            return +        if req_format is None or req_format == 'best': +            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality +        elif req_format == 'worst': +            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality +        elif req_format in ('-1', 'all'): +            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats +        else: +            # Specific formats. We pick the first in a slash-delimeted sequence. +            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. +            req_formats = req_format.split('/') +            video_url_list = None +            for rf in req_formats: +                if rf in url_map: +                    video_url_list = [(rf, url_map[rf])] +                    break +            if video_url_list is None: +                raise ExtractorError(u'requested format not available') +        return video_url_list + +    def _extract_from_m3u8(self, manifest_url, video_id): +        url_map = {} +        def _get_urls(_manifest): +            lines = _manifest.split('\n') +            urls = filter(lambda l: l and not l.startswith('#'), +                            lines) +            return urls +        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest') +        formats_urls = _get_urls(manifest) +        for format_url in formats_urls: +            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') +            url_map[itag] = format_url +        return url_map +      def _real_extract(self, url):          if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):              self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like  youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply  youtube-dl BaW_jenozKc  ).') @@ -486,7 +693,6 @@ class YoutubeIE(YoutubeSubtitlesIE):              video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])          # Decide which formats to download -        req_format = self._downloader.params.get('format', None)          try:              mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) @@ -500,6 +706,17 @@ class YoutubeIE(YoutubeSubtitlesIE):              if m_s is not None:                  self.to_screen(u'%s: Encrypted signatures detected.' % video_id)                  video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] +            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u'')) +            if m_s is not None: +                if 'url_encoded_fmt_stream_map' in video_info: +                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts'] +                else: +                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']] +            elif 'adaptive_fmts' in video_info: +                if 'url_encoded_fmt_stream_map' in video_info: +                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0] +                else: +                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']          except ValueError:              pass @@ -521,8 +738,8 @@ class YoutubeIE(YoutubeSubtitlesIE):                              s = url_data['s'][0]                              if age_gate:                                  player_version = self._search_regex(r'ad3-(.+?)\.swf', -                                    video_info['ad3_module'][0], 'flash player', -                                    fatal=False) +                                    video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND', +                                    'flash player', fatal=False)                                  player = 'flash player %s' % player_version                              else:                                  player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, @@ -530,41 +747,25 @@ class YoutubeIE(YoutubeSubtitlesIE):                              parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))                              self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %                                  (len(s), parts_sizes, url_data['itag'][0], player)) -                        signature = self._decrypt_signature(url_data['s'][0]) +                        encrypted_sig = url_data['s'][0] +                        if age_gate: +                            signature = self._decrypt_signature_age_gate(encrypted_sig) +                        else: +                            signature = self._decrypt_signature(encrypted_sig)                          url += '&signature=' + signature                      if 'ratebypass' not in url:                          url += '&ratebypass=yes'                      url_map[url_data['itag'][0]] = url - -            format_limit = self._downloader.params.get('format_limit', None) -            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats -            if format_limit is not None and format_limit in available_formats: -                format_list = available_formats[available_formats.index(format_limit):] -            else: -                format_list = available_formats -            existing_formats = [x for x in format_list if x in url_map] -            if len(existing_formats) == 0: -                raise ExtractorError(u'no known formats available for video') -            if self._downloader.params.get('listformats', None): -                self._print_formats(existing_formats) +            video_url_list = self._get_video_url_list(url_map) +            if not video_url_list:                  return -            if req_format is None or req_format == 'best': -                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality -            elif req_format == 'worst': -                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality -            elif req_format in ('-1', 'all'): -                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats -            else: -                # Specific formats. We pick the first in a slash-delimeted sequence. -                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. -                req_formats = req_format.split('/') -                video_url_list = None -                for rf in req_formats: -                    if rf in url_map: -                        video_url_list = [(rf, url_map[rf])] -                        break -                if video_url_list is None: -                    raise ExtractorError(u'requested format not available') +        elif video_info.get('hlsvp'): +            manifest_url = video_info['hlsvp'][0] +            url_map = self._extract_from_m3u8(manifest_url, video_id) +            video_url_list = self._get_video_url_list(url_map) +            if not video_url_list: +                return +          else:              raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') @@ -573,8 +774,9 @@ class YoutubeIE(YoutubeSubtitlesIE):              # Extension              video_extension = self._video_extensions.get(format_param, 'flv') -            video_format = '{0} - {1}'.format(format_param if format_param else video_extension, -                                              self._video_dimensions.get(format_param, '???')) +            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension, +                                              self._video_dimensions.get(format_param, '???'), +                                              ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')              results.append({                  'id':       video_id, @@ -604,10 +806,10 @@ class YoutubePlaylistIE(InfoExtractor):                             \? (?:.*?&)*? (?:p|a|list)=                          |  p/                          ) -                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})                          .*                       | -                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})                       )"""      _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'      _MAX_RESULTS = 50 @@ -626,11 +828,14 @@ class YoutubePlaylistIE(InfoExtractor):          # Download playlist videos from API          playlist_id = mobj.group(1) or mobj.group(2) -        page_num = 1          videos = [] -        while True: -            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) +        for page_num in itertools.count(1): +            start_index = self._MAX_RESULTS * (page_num - 1) + 1 +            if start_index >= 1000: +                self._downloader.report_warning(u'Max number of results reached') +                break +            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)              try: @@ -650,10 +855,6 @@ class YoutubePlaylistIE(InfoExtractor):                  if 'media$group' in entry and 'media$player' in entry['media$group']:                      videos.append((index, entry['media$group']['media$player']['url'])) -            if len(response['feed']['entry']) < self._MAX_RESULTS: -                break -            page_num += 1 -          videos = [v[1] for v in sorted(videos)]          url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] @@ -665,7 +866,7 @@ class YoutubeChannelIE(InfoExtractor):      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"      _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more' -    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' +    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = u'youtube:channel'      def extract_videos_from_page(self, page): @@ -696,9 +897,7 @@ class YoutubeChannelIE(InfoExtractor):          # Download any subsequent channel pages using the json-based channel_ajax query          if self._MORE_PAGES_INDICATOR in page: -            while True: -                pagenum = pagenum + 1 - +            for pagenum in itertools.count(1):                  url = self._MORE_PAGES_URL % (pagenum, channel_id)                  page = self._download_webpage(url, channel_id,                                                u'Downloading page #%s' % pagenum) @@ -741,9 +940,8 @@ class YoutubeUserIE(InfoExtractor):          # all of them.          video_ids = [] -        pagenum = 0 -        while True: +        for pagenum in itertools.count(0):              start_index = pagenum * self._GDATA_PAGE_SIZE + 1              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) @@ -768,8 +966,6 @@ class YoutubeUserIE(InfoExtractor):              if len(ids_in_page) < self._GDATA_PAGE_SIZE:                  break -            pagenum += 1 -          urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]          url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]          return [self.playlist_result(url_results, playlist_title = username)] @@ -832,38 +1028,75 @@ class YoutubeShowIE(InfoExtractor):          return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] -class YoutubeSubscriptionsIE(YoutubeIE): -    """It's a subclass of YoutubeIE because we need to login""" -    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' -    IE_NAME = u'youtube:subscriptions' -    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s' +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +    """ +    Base class for extractors that fetch info from +    http://www.youtube.com/feed_ajax +    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. +    """ +    _LOGIN_REQUIRED = True      _PAGING_STEP = 30 +    # use action_load_personal_feed instead of action_load_system_feed +    _PERSONAL_FEED = False -    # Overwrite YoutubeIE properties we don't want -    _TESTS = [] -    @classmethod -    def suitable(cls, url): -        return re.match(cls._VALID_URL, url) is not None +    @property +    def _FEED_TEMPLATE(self): +        action = 'action_load_system_feed' +        if self._PERSONAL_FEED: +            action = 'action_load_personal_feed' +        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) + +    @property +    def IE_NAME(self): +        return u'youtube:%s' % self._FEED_NAME      def _real_initialize(self): -        (username, password) = self._get_login_info() -        if username is None: -            raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True) -        super(YoutubeSubscriptionsIE, self)._real_initialize() +        self._login()      def _real_extract(self, url):          feed_entries = []          # The step argument is available only in 2.7 or higher          for i in itertools.count(0):              paging = i*self._PAGING_STEP -            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed', +            info = self._download_webpage(self._FEED_TEMPLATE % paging, +                                          u'%s feed' % self._FEED_NAME,                                            u'Downloading page %s' % i)              info = json.loads(info)              feed_html = info['feed_html'] -            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html) +            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)              ids = orderedSet(m.group(1) for m in m_ids)              feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)              if info['paging'] is None:                  break -        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions') +        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): +    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' +    _FEED_NAME = 'subscriptions' +    _PLAYLIST_TITLE = u'Youtube Subscriptions' + +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): +    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' +    _FEED_NAME = 'recommended' +    _PLAYLIST_TITLE = u'Youtube Recommended videos' + +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' +    _FEED_NAME = 'watch_later' +    _PLAYLIST_TITLE = u'Youtube Watch Later' +    _PAGING_STEP = 100 +    _PERSONAL_FEED = True + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): +    IE_NAME = u'youtube:favorites' +    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' +    _LOGIN_REQUIRED = True + +    def _real_extract(self, url): +        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') +        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') +        return self.url_result(playlist_id, 'YoutubePlaylist') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cf2ea654e..5dd5b2923 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -207,7 +207,7 @@ if sys.version_info >= (2,7):      def find_xpath_attr(node, xpath, key, val):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z]+$', key) -        assert re.match(r'^[a-zA-Z@]*$', val) +        assert re.match(r'^[a-zA-Z@\s]*$', val)          expr = xpath + u"[@%s='%s']" % (key, val)          return node.find(expr)  else: @@ -497,7 +497,7 @@ class ExtractorError(Exception):          if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):              expected = True          if not expected: -            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.' +            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'          super(ExtractorError, self).__init__(msg)          self.traceback = tb diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 32eb27dad..58e26bc49 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.19' +__version__ = '2013.08.21' | 
