diff options
| -rw-r--r-- | test/test_all_urls.py | 1 | ||||
| -rw-r--r-- | test/test_dailymotion_subtitles.py | 2 | ||||
| -rw-r--r-- | test/test_playlists.py | 10 | ||||
| -rw-r--r-- | test/test_youtube_subtitles.py | 2 | ||||
| -rw-r--r-- | youtube_dl/YoutubeDL.py | 4 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/archiveorg.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/dreisat.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/googleplus.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/mixcloud.py | 122 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 45 | ||||
| -rw-r--r-- | youtube_dl/extractor/subtitles.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/trilulilu.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/xhamster.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 3 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 11 | 
17 files changed, 130 insertions, 106 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 99fc7bd28..ff1c86efe 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668          self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])          self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) +        self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])      def test_youtube_channel_matching(self):          assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index bcd9f79f6..83c65d57e 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')      def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 5) @@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          self.assertTrue(len(subtitles.keys()) == 0)      def test_nosubtitles(self):          self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/test/test_playlists.py b/test/test_playlists.py index 4a2e00b01..d079a4f23 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,7 +8,7 @@ import json  import os  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE  from youtube_dl.utils import *  from helper import FakeYDL @@ -42,5 +42,13 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], u'5124905')          self.assertTrue(len(result['entries']) >= 11) +    def test_soundcloud_user(self): +        dl = FakeYDL() +        ie = SoundcloudUserIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], u'9615865') +        self.assertTrue(len(result['entries']) >= 12) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 5632871ac..168e6c66c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')      def test_youtube_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) @@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          self.assertTrue(subtitles['it'] is not None)      def test_youtube_nosubtitles(self):          self.url = 'sAjKT8FhjI8' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c2f992b8e..e53a2b8ad 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -74,6 +74,7 @@ class YoutubeDL(object):      writesubtitles:    Write the video subtitles to a file      writeautomaticsub: Write the automatic subtitles to a file      allsubtitles:      Downloads all the subtitles of the video +                       (requires writesubtitles or writeautomaticsub)      listsubtitles:     Lists all available subtitles for the video      subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)      subtitleslangs:    List of languages of the subtitles to download @@ -499,8 +500,7 @@ class YoutubeDL(object):                  return          subtitles_are_requested = any([self.params.get('writesubtitles', False), -                                       self.params.get('writeautomaticsub'), -                                       self.params.get('allsubtitles', False)]) +                                       self.params.get('writeautomaticsub')])          if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:              # subtitles download errors are already managed as troubles in relevant IE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 696e54f49..0022a4e7a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -533,6 +533,11 @@ def _real_main(argv=None):      else:          date = DateRange(opts.dateafter, opts.datebefore) +    # --all-sub automatically sets --write-sub if --write-auto-sub is not given +    # this was the old behaviour if only --all-sub was given. +    if opts.allsubtitles and (opts.writeautomaticsub == False): +        opts.writesubtitles = True +      if sys.version_info < (3,):          # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)          if opts.outtmpl is not None: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f9542d2..19d57c2e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -82,7 +82,7 @@ from .sina import SinaIE  from .slashdot import SlashdotIE  from .slideshare import SlideshareIE  from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE  from .statigram import StatigramIE diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):              for fn,fdata in data['files'].items()              if 'Video' in fdata['format']]          formats.sort(key=lambda fdata: fdata['file_size']) +        for f in formats: +            f['ext'] = determine_ext(f['url'])          info = {              '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):              info['thumbnail'] = thumbnail          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 64b465805..765cb1f37 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):              'width': int(fe.find('./width').text),              'height': int(fe.find('./height').text),              'url': fe.find('./url').text, +            'ext': determine_ext(fe.find('./url').text),              'filesize': int(fe.find('./filesize').text),              'video_bitrate': int(fe.find('./videoBitrate').text),              '3sat_qualityname': fe.find('./quality').text, @@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):          self.report_extraction(video_id)          # Extract update date -        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', +        upload_date = self._html_search_regex( +            ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],              webpage, u'upload date', fatal=False)          if upload_date:              # Convert timestring to a format suitable for filename diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket  from .common import InfoExtractor  from ..utils import (      compat_http_client, -    compat_str,      compat_urllib_error,      compat_urllib_request, - -    ExtractorError, +    unified_strdate,  )  class MixcloudIE(InfoExtractor): -    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/      _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'      IE_NAME = u'mixcloud' -    def report_download_json(self, file_id): -        """Report JSON download.""" -        self.to_screen(u'Downloading json') - -    def get_urls(self, jsonData, fmt, bitrate='best'): -        """Get urls from 'audio_formats' section in json""" -        try: -            bitrate_list = jsonData[fmt] -            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: -                bitrate = max(bitrate_list) # select highest - -            url_list = jsonData[fmt][bitrate] -        except TypeError: # we have no bitrate info. -            url_list = jsonData[fmt] -        return url_list +    _TEST = { +        u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', +        u'file': u'dholbach-cryptkeeper.mp3', +        u'info_dict': { +            u'title': u'Cryptkeeper', +            u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', +            u'uploader': u'Daniel Holbach', +            u'uploader_id': u'dholbach', +            u'upload_date': u'20111115', +        }, +    }      def check_urls(self, url_list):          """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):          return None -    def _print_formats(self, formats): -        print('Available formats:') -        for fmt in formats.keys(): -            for b in formats[fmt]: -                try: -                    ext = formats[fmt][b][0] -                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) -                except TypeError: # we have no bitrate info -                    ext = formats[fmt][0] -                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) -                    break -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) -        # extract uploader & filename from url -        uploader = mobj.group(1).decode('utf-8') -        file_id = uploader + "-" + mobj.group(2).decode('utf-8') - -        # construct API request -        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' -        # retrieve .json file with links to files -        request = compat_urllib_request.Request(file_url) -        try: -            self.report_download_json(file_url) -            jsonData = compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) - -        # parse JSON -        json_data = json.loads(jsonData) -        player_url = json_data['player_swf_url'] -        formats = dict(json_data['audio_formats']) - -        req_format = self._downloader.params.get('format', None) - -        if self._downloader.params.get('listformats', None): -            self._print_formats(formats) -            return - -        if req_format is None or req_format == 'best': -            for format_param in formats.keys(): -                url_list = self.get_urls(formats, format_param) -                # check urls -                file_url = self.check_urls(url_list) -                if file_url is not None: -                    break # got it! -        else: -            if req_format not in formats: -                raise ExtractorError(u'Format is not available') - -            url_list = self.get_urls(formats, req_format) -            file_url = self.check_urls(url_list) -            format_param = req_format -        return [{ -            'id': file_id.decode('utf-8'), -            'url': file_url.decode('utf-8'), -            'uploader': uploader.decode('utf-8'), -            'upload_date': None, -            'title': json_data['name'], -            'ext': file_url.split('.')[-1].decode('utf-8'), -            'format': (format_param is None and u'NA' or format_param.decode('utf-8')), -            'thumbnail': json_data['thumbnail_url'], -            'description': json_data['description'], -            'player_url': player_url.decode('utf-8'), -        }] +        uploader = mobj.group(1) +        cloudcast_name = mobj.group(2) +        track_id = '-'.join((uploader, cloudcast_name)) +        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) +        webpage = self._download_webpage(url, track_id) +        json_data = self._download_webpage(api_url, track_id, +            u'Downloading cloudcast info') +        info = json.loads(json_data) + +        preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') +        song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') +        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) +        final_song_url = self.check_urls(template_url % i for i in range(30)) + +        return { +            'id': track_id, +            'title': info['name'], +            'url': final_song_url, +            'ext': 'mp3', +            'description': info['description'], +            'thumbnail': info['pictures'].get('extra_large'), +            'uploader': info['user']['name'], +            'uploader_id': info['user']['username'], +            'upload_date': unified_strdate(info['created_time']), +            'view_count': info['play_count'], +        } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@  import json  import re +import itertools  from .common import InfoExtractor  from ..utils import (      compat_str,      compat_urlparse, +    compat_urllib_parse,      ExtractorError,      unified_strdate, @@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor):      def _resolv_url(cls, url):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID -    def _extract_info_dict(self, info, full_title=None): +    def _extract_info_dict(self, info, full_title=None, quiet=False):          video_id = info['id']          name = full_title or video_id -        self.report_extraction(name) +        if quiet == False: +            self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):                  'id': info['id'],                  'title': info['title'],                  } + + +class SoundcloudUserIE(SoundcloudIE): +    _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' +    IE_NAME = u'soundcloud:user' + +    # it's in tests/test_playlists.py +    _TEST = None + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        uploader = mobj.group('user') + +        url = 'http://soundcloud.com/%s/' % uploader +        resolv_url = self._resolv_url(url) +        user_json = self._download_webpage(resolv_url, uploader, +            u'Downloading user info') +        user = json.loads(user_json) + +        tracks = [] +        for i in itertools.count(): +            data = compat_urllib_parse.urlencode({'offset': i*50, +                                                  'client_id': self._CLIENT_ID, +                                                  }) +            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data +            response = self._download_webpage(tracks_url, uploader,  +                u'Downloading tracks page %s' % (i+1)) +            new_tracks = json.loads(response) +            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) +            if len(new_tracks) < 50: +                break + +        return { +            '_type': 'playlist', +            'id': compat_str(user['id']), +            'title': user['username'], +            'entries': tracks, +        } diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 97215f289..90de7de3a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):      @property      def _have_to_download_any_subtitles(self):          return any([self._downloader.params.get('writesubtitles', False), -                    self._downloader.params.get('writeautomaticsub'), -                    self._downloader.params.get('allsubtitles', False)]) +                    self._downloader.params.get('writeautomaticsub')])      def _list_available_subtitles(self, video_id, webpage=None):          """ outputs the available subtitles for the video """ @@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):          available_subs_list = {}          if self._downloader.params.get('writeautomaticsub', False):              available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) -        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): +        if self._downloader.params.get('writesubtitles', False):              available_subs_list.update(self._get_available_subtitles(video_id))          if not available_subs_list:  # error, it didn't get the available subtitles diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index f278951ba..0bf028f61 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):              {                  'format': fnode.text,                  'url': video_url_template % fnode.text, +                'ext': fnode.text.partition('-')[0]              }              for fnode in format_doc.findall('./formats/format') @@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = formats[-1]['format'].partition('-')[0] +        info.update(formats[-1])          return info diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index e50069586..fa759d30c 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,7 +11,7 @@ from ..utils import (  class XHamsterIE(InfoExtractor):      """Information Extractor for xHamster""" -    _VALID_URL = r'(?:http://)?(?P<url>(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html(?:\?.*)?)' +    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'      _TEST = {          u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',          u'file': u'1509445.flv', @@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        mrss_url = 'http://' +  mobj.group('url') +        mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id          webpage = self._download_webpage(mrss_url, video_id)          mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f49665925..e4a2e22bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                       (                           (?:https?://)?                                       # http(s):// (optional)                           (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| -                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains +                            tube\.majestyc\.net/| +                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls                           (?:                                                  # the various things that can precede the ID:                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/ diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 768c6207d..5558d4737 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -700,7 +700,16 @@ def unified_strdate(date_str):      date_str = date_str.replace(',',' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) -    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] +    format_expressions = [ +        '%d %B %Y', +        '%B %d %Y', +        '%b %d %Y', +        '%Y-%m-%d', +        '%d/%m/%Y', +        '%Y/%m/%d %H:%M:%S', +        '%d.%m.%Y %H:%M', +        '%Y-%m-%dT%H:%M:%SZ', +    ]      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') | 
