diff options
25 files changed, 325 insertions, 133 deletions
diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 6e3595366..b390c7e2e 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -32,9 +32,9 @@ tests = [      # 83      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",       ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), -    # 82 - vflZK4ZYR 2013/08/23 +    # 82 - vflGNjMhJ 2013/09/12      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", -     "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), +     ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"),      # 81 - vflLC8JvQ 2013/07/25      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",       "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 99fc7bd28..ff1c86efe 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668          self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])          self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) +        self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])      def test_youtube_channel_matching(self):          assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index bcd9f79f6..83c65d57e 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')      def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 5) @@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):          self.assertTrue(len(subtitles.keys()) == 0)      def test_nosubtitles(self):          self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/test/test_playlists.py b/test/test_playlists.py index 65de3a55c..d079a4f23 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,7 +8,7 @@ import json  import os  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE  from youtube_dl.utils import *  from helper import FakeYDL @@ -34,5 +34,21 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], u'Vimeo Tributes')          self.assertTrue(len(result['entries']) > 24) +    def test_ustream_channel(self): +        dl = FakeYDL() +        ie = UstreamChannelIE(dl) +        result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], u'5124905') +        self.assertTrue(len(result['entries']) >= 11) + +    def test_soundcloud_user(self): +        dl = FakeYDL() +        ie = SoundcloudUserIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], u'9615865') +        self.assertTrue(len(result['entries']) >= 12) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index be1069105..ff2e9885b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,13 +11,16 @@ import os  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  #from youtube_dl.utils import htmlentity_transform -from youtube_dl.utils import timeconvert -from youtube_dl.utils import sanitize_filename -from youtube_dl.utils import unescapeHTML -from youtube_dl.utils import orderedSet -from youtube_dl.utils import DateRange -from youtube_dl.utils import unified_strdate -from youtube_dl.utils import find_xpath_attr +from youtube_dl.utils import ( +    timeconvert, +    sanitize_filename, +    unescapeHTML, +    orderedSet, +    DateRange, +    unified_strdate, +    find_xpath_attr, +    get_meta_content, +)  if sys.version_info < (3, 0):      _compat_str = lambda b: b.decode('unicode-escape') @@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase):          self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])          self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) +    def test_meta_parser(self): +        testhtml = u''' +        <head> +            <meta name="description" content="foo & bar"> +            <meta content='Plato' name='author'/> +        </head> +        ''' +        get_meta = lambda name: get_meta_content(name, testhtml) +        self.assertEqual(get_meta('description'), u'foo & bar') +        self.assertEqual(get_meta('author'), 'Plato') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 5632871ac..168e6c66c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')      def test_youtube_allsubtitles(self): +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) @@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          self.assertTrue(subtitles['it'] is not None)      def test_youtube_nosubtitles(self):          self.url = 'sAjKT8FhjI8' +        self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b289bd9e2..e53a2b8ad 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -74,6 +74,7 @@ class YoutubeDL(object):      writesubtitles:    Write the video subtitles to a file      writeautomaticsub: Write the automatic subtitles to a file      allsubtitles:      Downloads all the subtitles of the video +                       (requires writesubtitles or writeautomaticsub)      listsubtitles:     Lists all available subtitles for the video      subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)      subtitleslangs:    List of languages of the subtitles to download @@ -492,13 +493,14 @@ class YoutubeDL(object):                  self.report_writedescription(descfn)                  with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:                      descfile.write(info_dict['description']) +            except (KeyError, TypeError): +                self.report_warning(u'There\'s no description to write.')              except (OSError, IOError):                  self.report_error(u'Cannot write description file ' + descfn)                  return          subtitles_are_requested = any([self.params.get('writesubtitles', False), -                                       self.params.get('writeautomaticsub'), -                                       self.params.get('allsubtitles', False)]) +                                       self.params.get('writeautomaticsub')])          if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:              # subtitles download errors are already managed as troubles in relevant IE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 696e54f49..0022a4e7a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -533,6 +533,11 @@ def _real_main(argv=None):      else:          date = DateRange(opts.dateafter, opts.datebefore) +    # --all-sub automatically sets --write-sub if --write-auto-sub is not given +    # this was the old behaviour if only --all-sub was given. +    if opts.allsubtitles and (opts.writeautomaticsub == False): +        opts.writesubtitles = True +      if sys.version_info < (3,):          # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)          if opts.outtmpl is not None: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 26cf24935..246f1e8b5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -52,6 +52,7 @@ from .jeuxvideo import JeuxVideoIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE  from .kankan import KankanIE +from .kickstarter import KickStarterIE  from .keek import KeekIE  from .liveleak import LiveLeakIE  from .livestream import LivestreamIE @@ -81,7 +82,8 @@ from .sina import SinaIE  from .slashdot import SlashdotIE  from .slideshare import SlideshareIE  from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .southparkstudios import SouthParkStudiosIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE  from .statigram import StatigramIE @@ -96,7 +98,7 @@ from .tudou import TudouIE  from .tumblr import TumblrIE  from .tutv import TutvIE  from .unistra import UnistraIE -from .ustream import UstreamIE +from .ustream import UstreamIE, UstreamChannelIE  from .vbox7 import Vbox7IE  from .veehd import VeeHDIE  from .veoh import VeohIE diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):              for fn,fdata in data['files'].items()              if 'Video' in fdata['format']]          formats.sort(key=lambda fdata: fdata['file_size']) +        for f in formats: +            f['ext'] = determine_ext(f['url'])          info = {              '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):              info['thumbnail'] = thumbnail          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1f02519a0..1db9b24cf 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,3 +1,4 @@ +# encoding: utf-8  import re  import xml.etree.ElementTree @@ -5,24 +6,29 @@ from .common import InfoExtractor  from ..utils import unified_strdate  class CanalplusIE(InfoExtractor): -    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' +    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'      _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'      IE_NAME = u'canalplus.fr'      _TEST = { -        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', -        u'file': u'889861.flv', -        u'md5': u'590a888158b5f0d6832f84001fbf3e99', +        u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', +        u'file': u'922470.flv',          u'info_dict': { -            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', -            u'upload_date': u'20130620', +            u'title': u'Zapping - 26/08/13', +            u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', +            u'upload_date': u'20130826', +        }, +        u'params': { +            u'skip_download': True,          }, -        u'skip': u'Requires rtmpdump'      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        if video_id is None: +            webpage = self._download_webpage(url, mobj.group('path')) +            video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')          info_url = self._VIDEO_INFO_TEMPLATE % video_id          info_page = self._download_webpage(info_url,video_id,                                              u'Downloading video info') @@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor):                  'ext': 'flv',                  'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),                  'thumbnail': media.find('IMAGES/GRAND').text, +                'description': infos.find('DESCRIPTION').text, +                'view_count': int(infos.find('NB_VUES').text),                  } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 64b465805..765cb1f37 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):              'width': int(fe.find('./width').text),              'height': int(fe.find('./height').text),              'url': fe.find('./url').text, +            'ext': determine_ext(fe.find('./url').text),              'filesize': int(fe.find('./filesize').text),              'video_bitrate': int(fe.find('./videoBitrate').text),              '3sat_qualityname': fe.find('./quality').text, @@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = determine_ext(formats[-1]['url']) +        info.update(formats[-1]) -        return info
\ No newline at end of file +        return info diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4508f0dfa..f3d86a711 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', +        video_url = self._search_regex(r'type="video/mp4" src="(.*?)"',              webpage, u'video URL', flags=re.DOTALL)          info = { diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 7585b7061..cd3bbe65f 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor):          u"file": u"6410818.mp4",          u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",          u"info_dict": { -            u"title": u"Arma III - Community Guide: SITREP I", +            u"title": u"Arma 3 - Community Guide: SITREP I",              u"upload_date": u"20130627",           }      } diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):          self.report_extraction(video_id)          # Extract update date -        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', +        upload_date = self._html_search_regex( +            ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],              webpage, u'upload date', fatal=False)          if upload_date:              # Convert timestring to a format suitable for filename diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py new file mode 100644 index 000000000..50bc883ef --- /dev/null +++ b/youtube_dl/extractor/kickstarter.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor + + +class KickStarterIE(InfoExtractor): +    _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*' +    _TEST = { +        u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location", +        u"file": u"1404461844.mp4", +        u"md5": u"c81addca81327ffa66c642b5d8b08cab", +        u"info_dict": { +            u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling", +        }, +    } + +    def _real_extract(self, url): +        m = re.match(self._VALID_URL, url) +        video_id = m.group('id') +        webpage_src = self._download_webpage(url, video_id) + +        video_url = self._search_regex(r'data-video="(.*?)">', +            webpage_src, u'video URL') +        if 'mp4' in video_url: +            ext = 'mp4' +        else: +            ext = 'flv' +        video_title = self._html_search_regex(r"<title>(.*?)</title>", +            webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip() + +        results = [{ +                    'id': video_id, +                    'url': video_url, +                    'title': video_title, +                    'ext': ext, +                    }] +        return results diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket  from .common import InfoExtractor  from ..utils import (      compat_http_client, -    compat_str,      compat_urllib_error,      compat_urllib_request, - -    ExtractorError, +    unified_strdate,  )  class MixcloudIE(InfoExtractor): -    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/      _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'      IE_NAME = u'mixcloud' -    def report_download_json(self, file_id): -        """Report JSON download.""" -        self.to_screen(u'Downloading json') - -    def get_urls(self, jsonData, fmt, bitrate='best'): -        """Get urls from 'audio_formats' section in json""" -        try: -            bitrate_list = jsonData[fmt] -            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: -                bitrate = max(bitrate_list) # select highest - -            url_list = jsonData[fmt][bitrate] -        except TypeError: # we have no bitrate info. -            url_list = jsonData[fmt] -        return url_list +    _TEST = { +        u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', +        u'file': u'dholbach-cryptkeeper.mp3', +        u'info_dict': { +            u'title': u'Cryptkeeper', +            u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', +            u'uploader': u'Daniel Holbach', +            u'uploader_id': u'dholbach', +            u'upload_date': u'20111115', +        }, +    }      def check_urls(self, url_list):          """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):          return None -    def _print_formats(self, formats): -        print('Available formats:') -        for fmt in formats.keys(): -            for b in formats[fmt]: -                try: -                    ext = formats[fmt][b][0] -                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) -                except TypeError: # we have no bitrate info -                    ext = formats[fmt][0] -                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) -                    break -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) -        # extract uploader & filename from url -        uploader = mobj.group(1).decode('utf-8') -        file_id = uploader + "-" + mobj.group(2).decode('utf-8') - -        # construct API request -        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' -        # retrieve .json file with links to files -        request = compat_urllib_request.Request(file_url) -        try: -            self.report_download_json(file_url) -            jsonData = compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) - -        # parse JSON -        json_data = json.loads(jsonData) -        player_url = json_data['player_swf_url'] -        formats = dict(json_data['audio_formats']) - -        req_format = self._downloader.params.get('format', None) - -        if self._downloader.params.get('listformats', None): -            self._print_formats(formats) -            return - -        if req_format is None or req_format == 'best': -            for format_param in formats.keys(): -                url_list = self.get_urls(formats, format_param) -                # check urls -                file_url = self.check_urls(url_list) -                if file_url is not None: -                    break # got it! -        else: -            if req_format not in formats: -                raise ExtractorError(u'Format is not available') - -            url_list = self.get_urls(formats, req_format) -            file_url = self.check_urls(url_list) -            format_param = req_format -        return [{ -            'id': file_id.decode('utf-8'), -            'url': file_url.decode('utf-8'), -            'uploader': uploader.decode('utf-8'), -            'upload_date': None, -            'title': json_data['name'], -            'ext': file_url.split('.')[-1].decode('utf-8'), -            'format': (format_param is None and u'NA' or format_param.decode('utf-8')), -            'thumbnail': json_data['thumbnail_url'], -            'description': json_data['description'], -            'player_url': player_url.decode('utf-8'), -        }] +        uploader = mobj.group(1) +        cloudcast_name = mobj.group(2) +        track_id = '-'.join((uploader, cloudcast_name)) +        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) +        webpage = self._download_webpage(url, track_id) +        json_data = self._download_webpage(api_url, track_id, +            u'Downloading cloudcast info') +        info = json.loads(json_data) + +        preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') +        song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') +        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) +        final_song_url = self.check_urls(template_url % i for i in range(30)) + +        return { +            'id': track_id, +            'title': info['name'], +            'url': final_song_url, +            'ext': 'mp3', +            'description': info['description'], +            'thumbnail': info['pictures'].get('extra_large'), +            'uploader': info['user']['name'], +            'uploader_id': info['user']['username'], +            'upload_date': unified_strdate(info['created_time']), +            'view_count': info['play_count'], +        } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@  import json  import re +import itertools  from .common import InfoExtractor  from ..utils import (      compat_str,      compat_urlparse, +    compat_urllib_parse,      ExtractorError,      unified_strdate, @@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor):      def _resolv_url(cls, url):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID -    def _extract_info_dict(self, info, full_title=None): +    def _extract_info_dict(self, info, full_title=None, quiet=False):          video_id = info['id']          name = full_title or video_id -        self.report_extraction(name) +        if quiet == False: +            self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):                  'id': info['id'],                  'title': info['title'],                  } + + +class SoundcloudUserIE(SoundcloudIE): +    _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' +    IE_NAME = u'soundcloud:user' + +    # it's in tests/test_playlists.py +    _TEST = None + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        uploader = mobj.group('user') + +        url = 'http://soundcloud.com/%s/' % uploader +        resolv_url = self._resolv_url(url) +        user_json = self._download_webpage(resolv_url, uploader, +            u'Downloading user info') +        user = json.loads(user_json) + +        tracks = [] +        for i in itertools.count(): +            data = compat_urllib_parse.urlencode({'offset': i*50, +                                                  'client_id': self._CLIENT_ID, +                                                  }) +            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data +            response = self._download_webpage(tracks_url, uploader,  +                u'Downloading tracks page %s' % (i+1)) +            new_tracks = json.loads(response) +            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) +            if len(new_tracks) < 50: +                break + +        return { +            '_type': 'playlist', +            'id': compat_str(user['id']), +            'title': user['username'], +            'entries': tracks, +        } diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py new file mode 100644 index 000000000..a5dc754dd --- /dev/null +++ b/youtube_dl/extractor/southparkstudios.py @@ -0,0 +1,34 @@ +import re + +from .mtv import MTVIE, _media_xml_tag + + +class SouthParkStudiosIE(MTVIE): +    IE_NAME = u'southparkstudios.com' +    _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P<id>\d+)' + +    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + +    _TEST = { +        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', +        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', +        u'info_dict': { +            u'title': u'Bat Daded', +            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', +        }, +    } + +    # Overwrite MTVIE properties we don't want +    _TESTS = [] + +    def _get_thumbnail_url(self, uri, itemdoc): +        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) +        return itemdoc.find(search_path).attrib['url'] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) +        mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', +                                  webpage, u'mgid') +        return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 97215f289..90de7de3a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):      @property      def _have_to_download_any_subtitles(self):          return any([self._downloader.params.get('writesubtitles', False), -                    self._downloader.params.get('writeautomaticsub'), -                    self._downloader.params.get('allsubtitles', False)]) +                    self._downloader.params.get('writeautomaticsub')])      def _list_available_subtitles(self, video_id, webpage=None):          """ outputs the available subtitles for the video """ @@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):          available_subs_list = {}          if self._downloader.params.get('writeautomaticsub', False):              available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) -        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): +        if self._downloader.params.get('writesubtitles', False):              available_subs_list.update(self._get_available_subtitles(video_id))          if not available_subs_list:  # error, it didn't get the available subtitles diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index f278951ba..0bf028f61 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):              {                  'format': fnode.text,                  'url': video_url_template % fnode.text, +                'ext': fnode.text.partition('-')[0]              }              for fnode in format_doc.findall('./formats/format') @@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):          }          # TODO: Remove when #980 has been merged -        info['url'] = formats[-1]['url'] -        info['ext'] = formats[-1]['format'].partition('-')[0] +        info.update(formats[-1])          return info diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 5f423870a..74c82587f 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,6 +1,11 @@ +import json  import re  from .common import InfoExtractor +from ..utils import ( +    compat_urlparse, +    get_meta_content, +)  class UstreamIE(InfoExtractor): @@ -43,3 +48,25 @@ class UstreamIE(InfoExtractor):                  'thumbnail': thumbnail,                 }          return info + +class UstreamChannelIE(InfoExtractor): +    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)' +    IE_NAME = u'ustream:channel' + +    def _real_extract(self, url): +        m = re.match(self._VALID_URL, url) +        slug = m.group('slug') +        webpage = self._download_webpage(url, slug) +        channel_id = get_meta_content('ustream:channel_id', webpage) + +        BASE = 'http://www.ustream.tv' +        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id +        video_ids = [] +        while next_url: +            reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id)) +            video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) +            next_url = reply['nextUrl'] + +        urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids] +        url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls] +        return self.playlist_result(url_entries, channel_id) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 88b8b6be0..fa759d30c 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id +        mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id          webpage = self._download_webpage(mrss_url, video_id)          mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e0d70eaf..e4a2e22bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                       (                           (?:https?://)?                                       # http(s):// (optional)                           (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| -                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains +                            tube\.majestyc\.net/| +                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls                           (?:                                                  # the various things that can precede the ID:                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/ @@ -434,7 +435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          elif len(s) == 83:              return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]          elif len(s) == 82: -            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] +            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]          elif len(s) == 81:              return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]          elif len(s) == 80: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 201802cee..5558d4737 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):      return (u'&%s;' % entity)  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): +    def __init(self): +        compat_html_parser.HTMLParser.__init__(self) +        self.html = None + +    def loads(self, html): +        self.html = html +        self.feed(html) +        self.close() + +class AttrParser(BaseHTMLParser):      """Modified HTMLParser that isolates a tag with the specified attribute"""      def __init__(self, attribute, value):          self.attribute = attribute @@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser):          self.result = None          self.started = False          self.depth = {} -        self.html = None          self.watch_startpos = False          self.error_count = 0 -        compat_html_parser.HTMLParser.__init__(self) +        BaseHTMLParser.__init__(self)      def error(self, message):          if self.error_count > 10 or self.started: @@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser):          self.error_count += 1          self.goahead(1) -    def loads(self, html): -        self.html = html -        self.feed(html) -        self.close() -      def handle_starttag(self, tag, attrs):          attrs = dict(attrs)          if self.started: @@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):          pass      return parser.get_result() +class MetaParser(BaseHTMLParser): +    """ +    Modified HTMLParser that isolates a meta tag with the specified name  +    attribute. +    """ +    def __init__(self, name): +        BaseHTMLParser.__init__(self) +        self.name = name +        self.content = None +        self.result = None + +    def handle_starttag(self, tag, attrs): +        if tag != 'meta': +            return +        attrs = dict(attrs) +        if attrs.get('name') == self.name: +            self.result = attrs.get('content') + +    def get_result(self): +        return self.result + +def get_meta_content(name, html): +    """ +    Return the content attribute from the meta tag with the given name attribute. +    """ +    parser = MetaParser(name) +    try: +        parser.loads(html) +    except compat_html_parser.HTMLParseError: +        pass +    return parser.get_result() +  def clean_html(html):      """Clean an HTML snippet into a readable string""" @@ -664,7 +700,16 @@ def unified_strdate(date_str):      date_str = date_str.replace(',',' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) -    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] +    format_expressions = [ +        '%d %B %Y', +        '%B %d %Y', +        '%b %d %Y', +        '%Y-%m-%d', +        '%d/%m/%Y', +        '%Y/%m/%d %H:%M:%S', +        '%d.%m.%Y %H:%M', +        '%Y-%m-%dT%H:%M:%SZ', +    ]      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')  | 
