 AUTHORS                            |   1
 test/test_utils.py                 |   9
 youtube_dl/compat.py               |   6
 youtube_dl/extractor/bbc.py        |  47
 youtube_dl/extractor/common.py     |  19
 youtube_dl/extractor/gdcvault.py   |  33
 youtube_dl/extractor/soundcloud.py | 135
 youtube_dl/extractor/vidme.py      |  37
 youtube_dl/extractor/viewster.py   |   7
 youtube_dl/extractor/youtube.py    | 142
 youtube_dl/utils.py                |  17
 11 files changed, 361 insertions(+), 92 deletions(-)
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -135,3 +135,4 @@ Bernhard Minks
 sceext
 Zach Bruggeman
 Tjark Saul
+slangangular
diff --git a/test/test_utils.py b/test/test_utils.py
index 65692a9fb..a759b2da9 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -235,12 +235,21 @@ class TestUtil(unittest.TestCase):
             <node x="a"/>
             <node x="a" y="c" />
             <node x="b" y="d" />
+            <node x="" />
         </root>'''
         doc = xml.etree.ElementTree.fromstring(testxml)
+        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1])
         self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2])
         self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4])
 
     def test_xpath_with_ns(self):
         testxml = '''<root xmlns:media="http://example.com/">
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 0c57c7aeb..e4b9286c0 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -43,6 +43,11 @@ except ImportError:  # Python 2
     import cookielib as compat_cookiejar
 
 try:
+    import http.cookies as compat_cookies
+except ImportError:  # Python 2
+    import Cookie as compat_cookies
+
+try:
     import html.entities as compat_html_entities
 except ImportError:  # Python 2
     import htmlentitydefs as compat_html_entities
@@ -436,6 +441,7 @@ __all__ = [
     'compat_basestring',
     'compat_chr',
     'compat_cookiejar',
+    'compat_cookies',
     'compat_expanduser',
     'compat_get_terminal_size',
     'compat_getenv',
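For context, compat_cookies is just the stdlib http.cookies module (Cookie on Python 2). A minimal sketch, stdlib only, of how SimpleCookie turns a raw Cookie header into accessible values; note that it does not URL-decode them, which matters for the viewster.py change further down:

    # Python 3 stdlib; compat_cookies aliases this module.
    from http.cookies import SimpleCookie

    cookie = SimpleCookie()
    cookie.load('api_token=abc%3D%3D; session=xyz')
    print(cookie['api_token'].value)  # 'abc%3D%3D', still percent-encoded
    print(cookie['session'].value)    # 'xyz'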
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 01d07c9c0..9a1b6e3dc 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):
     IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
 
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+    ]
 
     _TESTS = [
         {
@@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
         }
     ]
 
+    class MediaSelectionError(Exception):
+        def __init__(self, id):
+            self.id = id
+
     def _extract_asx_playlist(self, connection, programme_id):
         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -212,8 +218,7 @@
     def _extract_medias(self, media_selection):
         error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
         if error is not None:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
         return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
 
     def _extract_connections(self, media):
@@ -270,9 +275,23 @@
             ]
         return subtitles
 
+    def _raise_extractor_error(self, media_selection_error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            expected=True)
+
     def _download_media_selector(self, programme_id):
-        return self._download_media_selector_url(
-            self._MEDIASELECTOR_URL % programme_id, programme_id)
+        last_exception = None
+        for mediaselector_url in self._MEDIASELECTOR_URLS:
+            try:
+                return self._download_media_selector_url(
+                    mediaselector_url % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as e:
+                if e.id == 'notukerror':
+                    last_exception = e
+                    continue
+                self._raise_extractor_error(e)
+        self._raise_extractor_error(last_exception)
 
     def _download_media_selector_url(self, url, programme_id=None):
         try:
@@ -297,7 +316,6 @@
                 formats.extend(self._extract_video(media, programme_id))
             elif kind == 'captions':
                 subtitles = self.extract_subtitles(media, programme_id)
-
         return formats, subtitles
 
     def _download_playlist(self, playlist_id):
@@ -426,9 +444,14 @@ class BBCIE(BBCCoUkIE):
     IE_DESC = 'BBC'
     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 
-    # fails with notukerror for some videos
-    # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        # Provides more formats, namely direct mp4 links, but fails on some videos with
+        # notukerror for non UK (?) users (e.g.
+        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+        # Provides fewer formats, but works everywhere for everybody (hopefully)
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    ]
 
     _TESTS = [{
         # article with multiple videos embedded with data-media-meta containing
@@ -463,7 +486,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/news/world-europe-32041533',
         'info_dict': {
             'id': 'p02mprgb',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
             'duration': 47,
             'timestamp': 1427219242,
@@ -523,7 +546,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
         'info_dict': {
             'id': 'p018zqqg',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Hyundai Santa Fe Sport: Rock star',
             'description': 'md5:b042a26142c4154a6e472933cf20793d',
             'timestamp': 1368473503,
@@ -538,7 +561,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/sport/0/football/33653409',
         'info_dict': {
             'id': 'p02xycnp',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
             'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
             'duration': 140,
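The net effect of MediaSelectionError plus the loop in _download_media_selector: endpoints are tried in order, the geo error is treated as "try the next one", and any other selector error aborts immediately. Restated as a standalone sketch (fetch stands in for _download_media_selector_url; the real code re-wraps errors via _raise_extractor_error):

    class MediaSelectionError(Exception):
        def __init__(self, id):
            self.id = id

    def download_media_selector(urls, programme_id, fetch):
        last_exception = None
        for url_template in urls:
            try:
                return fetch(url_template % programme_id)
            except MediaSelectionError as e:
                if e.id == 'notukerror':
                    # Geo-restricted on this endpoint: remember it, try the next.
                    last_exception = e
                    continue
                raise  # any other selector error is fatal right away
        # Every endpoint was geo-blocked; surface the last geo error.
        raise last_exception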
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14b9b4fe2..dc5080504 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,10 +14,12 @@ import xml.etree.ElementTree
 
 from ..compat import (
     compat_cookiejar,
+    compat_cookies,
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
     compat_urllib_parse_urlparse,
+    compat_urllib_request,
     compat_urlparse,
     compat_str,
 )
@@ -181,6 +183,7 @@ class InfoExtractor(object):
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
     start_time:     Time in seconds where the reproduction should start, as
@@ -630,6 +633,12 @@ class InfoExtractor(object):
             template % (content_re, property_re),
         ]
 
+    @staticmethod
+    def _meta_regex(prop):
+        return r'''(?isx)<meta
+                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
     def _og_search_property(self, prop, html, name=None, **kargs):
         if name is None:
             name = 'OpenGraph %s' % prop
@@ -660,9 +669,7 @@
         if display_name is None:
             display_name = name
         return self._html_search_regex(
-            r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
-                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+            self._meta_regex(name),
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -1069,6 +1076,12 @@
             None, '/', True, False, expire_time, '', None, None, None)
         self._downloader.cookiejar.set_cookie(cookie)
 
+    def _get_cookies(self, url):
+        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        req = compat_urllib_request.Request(url)
+        self._downloader.cookiejar.add_cookie_header(req)
+        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
     def get_testcases(self, include_onlymatching=False):
         t = getattr(self, '_TEST', None)
         if t:
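_get_cookies leans on a neat stdlib trick: rather than filtering the jar by domain and path by hand, it builds a throwaway Request, lets the cookiejar write the Cookie header it would send for that URL, and parses the header back. A self-contained Python 3 version of the same idea (urllib/http names instead of the compat aliases):

    import http.cookiejar
    import urllib.request
    from http.cookies import SimpleCookie

    def get_cookies(jar, url):
        """Return a SimpleCookie with the cookies the jar would send to url."""
        req = urllib.request.Request(url)
        # The jar applies its own domain/path/secure matching here.
        jar.add_cookie_header(req)
        return SimpleCookie(req.get_header('Cookie'))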
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 43f916412..a6834db43 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -7,7 +7,10 @@ from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
-from ..utils import remove_end
+from ..utils import (
+    remove_end,
+    HEADRequest,
+)
 
 
 class GDCVaultIE(InfoExtractor):
@@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor):
         return video_formats
 
     def _parse_flv(self, xml_description):
-        video_formats = []
+        formats = []
         akamai_url = xml_description.find('./metadata/akamaiHost').text
+        audios = xml_description.find('./metadata/audios')
+        if audios is not None:
+            for audio in audios:
+                formats.append({
+                    'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+                    'play_path': remove_end(audio.get('url'), '.flv'),
+                    'ext': 'flv',
+                    'vcodec': 'none',
+                    'format_id': audio.get('code'),
+                })
         slide_video_path = xml_description.find('./metadata/slideVideo').text
-        video_formats.append({
+        formats.append({
             'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
             'play_path': remove_end(slide_video_path, '.flv'),
             'ext': 'flv',
@@ -86,7 +99,7 @@
             'format_id': 'slides',
         })
         speaker_video_path = xml_description.find('./metadata/speakerVideo').text
-        video_formats.append({
+        formats.append({
             'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
             'play_path': remove_end(speaker_video_path, '.flv'),
             'ext': 'flv',
@@ -95,7 +108,7 @@
             'preference': -1,
             'format_id': 'speaker',
         })
-        return video_formats
+        return formats
 
     def _login(self, webpage_url, display_id):
         (username, password) = self._get_login_info()
@@ -133,16 +146,18 @@
             r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
             start_page, 'url', default=None)
         if direct_url:
-            video_url = 'http://www.gdcvault.com/' + direct_url
             title = self._html_search_regex(
                 r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
                 start_page, 'title')
+            video_url = 'http://www.gdcvault.com' + direct_url
+            # resolve the url so that we can detect the correct extension
+            head = self._request_webpage(HEADRequest(video_url), video_id)
+            video_url = head.geturl()
 
             return {
                 'id': video_id,
                 'display_id': display_id,
                 'url': video_url,
-                'ext': 'flv',
                 'title': title,
             }
 
@@ -168,8 +183,8 @@
             # Fallback to the older format
             xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
 
-        xml_decription_url = xml_root + 'xml/' + xml_name
-        xml_description = self._download_xml(xml_decription_url, display_id)
+        xml_description_url = xml_root + 'xml/' + xml_name
+        xml_description = self._download_xml(xml_description_url, display_id)
 
         video_title = xml_description.find('./metadata/title').text
         video_formats = self._parse_mp4(xml_description)
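The direct_url branch now resolves redirects before trusting the filename, so the extension is detected from the final URL rather than the vanity path (which is why the hard-coded 'ext': 'flv' can go). youtube_dl.utils.HEADRequest is essentially a Request with its method forced to HEAD; a stdlib-only sketch of the same resolution step:

    import urllib.request

    class HEADRequest(urllib.request.Request):
        # Mirrors youtube_dl.utils.HEADRequest: fetch headers only, no body.
        def get_method(self):
            return 'HEAD'

    def resolve_final_url(url):
        # urlopen follows HTTP redirects; geturl() reports the URL we landed
        # on, which carries the real file extension.
        with urllib.request.urlopen(HEADRequest(url)) as resp:
            return resp.geturl()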
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 0a6c9fe72..6ce86cbcd 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
                             (?P<uploader>[\w\d-]+)/
-                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
+                            (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
                             (?P<title>[\w\d-]+)/?
                             (?P<token>[^?]+?)?(?:[?].*)?$)
                        |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -293,60 +293,139 @@ class SoundcloudSetIE(SoundcloudIE):
 
 
 class SoundcloudUserIE(SoundcloudIE):
-    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:(?:www|m)\.)?soundcloud\.com/
+                            (?P<user>[^/]+)
+                            (?:/
+                                (?P<rsrc>tracks|sets|reposts|likes|spotlight)
+                            )?
+                            /?(?:[?#].*)?$
+                    '''
     IE_NAME = 'soundcloud:user'
     _TESTS = [{
-        'url': 'https://soundcloud.com/the-concept-band',
+        'url': 'https://soundcloud.com/the-akashic-chronicler',
         'info_dict': {
-            'id': '9615865',
-            'title': 'The Royal Concept',
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (All)',
         },
-        'playlist_mincount': 12
+        'playlist_mincount': 112,
     }, {
-        'url': 'https://soundcloud.com/the-concept-band/likes',
+        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
         'info_dict': {
-            'id': '9615865',
-            'title': 'The Royal Concept',
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Tracks)',
         },
-        'playlist_mincount': 1,
+        'playlist_mincount': 50,
     }, {
-        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
-        'only_matching': True,
+        'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Playlists)',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Reposts)',
+        },
+        'playlist_mincount': 9,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Likes)',
+        },
+        'playlist_mincount': 333,
+    }, {
+        'url': 'https://soundcloud.com/grynpyret/spotlight',
+        'info_dict': {
+            'id': '7098329',
+            'title': 'Grynpyret (Spotlight)',
+        },
+        'playlist_mincount': 1,
     }]
 
+    _API_BASE = 'https://api.soundcloud.com'
+    _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+    _BASE_URL_MAP = {
+        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
+        'tracks': '%s/users/%%s/tracks' % _API_BASE,
+        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
+        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
+        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
+        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+    }
+
+    _TITLE_MAP = {
+        'all': 'All',
+        'tracks': 'Tracks',
+        'sets': 'Playlists',
+        'reposts': 'Reposts',
+        'likes': 'Likes',
+        'spotlight': 'Spotlight',
+    }
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group('user')
-        resource = mobj.group('rsrc')
-        if resource is None:
-            resource = 'tracks'
-        elif resource == 'likes':
-            resource = 'favorites'
 
         url = 'http://soundcloud.com/%s/' % uploader
         resolv_url = self._resolv_url(url)
         user = self._download_json(
             resolv_url, uploader, 'Downloading user info')
 
-        base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)
+        resource = mobj.group('rsrc') or 'all'
+        base_url = self._BASE_URL_MAP[resource] % user['id']
+
+        next_href = None
 
         entries = []
         for i in itertools.count():
-            data = compat_urllib_parse.urlencode({
-                'offset': i * 50,
-                'limit': 50,
-                'client_id': self._CLIENT_ID,
-            })
-            new_entries = self._download_json(
-                base_url + data, uploader, 'Downloading track page %s' % (i + 1))
-            if len(new_entries) == 0:
+            if not next_href:
+                data = compat_urllib_parse.urlencode({
+                    'offset': i * 50,
+                    'limit': 50,
+                    'client_id': self._CLIENT_ID,
+                    'linked_partitioning': '1',
+                    'representation': 'speedy',
+                })
+                next_href = base_url + '?' + data
+
+            response = self._download_json(
+                next_href, uploader, 'Downloading track page %s' % (i + 1))
+
+            collection = response['collection']
+
+            if not collection:
                 self.to_screen('%s: End page received' % uploader)
                 break
-            entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)
+
+            def resolve_permalink_url(candidates):
+                for cand in candidates:
+                    if isinstance(cand, dict):
+                        permalink_url = cand.get('permalink_url')
+                        if permalink_url and permalink_url.startswith('http'):
+                            return permalink_url
+
+            for e in collection:
+                permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+                if permalink_url:
+                    entries.append(self.url_result(permalink_url))
+
+            if 'next_href' in response:
+                next_href = response['next_href']
+                if not next_href:
+                    break
+            else:
+                next_href = None
 
         return {
             '_type': 'playlist',
             'id': compat_str(user['id']),
-            'title': user['username'],
+            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
             'entries': entries,
         }
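The rewritten paging loop mixes two schemes: a classic offset/limit query bootstraps the first page (with linked_partitioning=1 asking the API for cursors), after which the server-supplied next_href drives the loop. A minimal sketch of that cursor-following shape, with fetch_json standing in for _download_json:

    import itertools
    from urllib.parse import urlencode

    def iter_collection(base_url, fetch_json, page_size=50):
        # fetch_json(url) -> dict with 'collection' and (maybe) 'next_href'.
        next_href = None
        for i in itertools.count():
            if not next_href:
                # No cursor yet (or the API stopped providing one):
                # fall back to plain offset paging.
                next_href = base_url + '?' + urlencode({
                    'offset': i * page_size,
                    'limit': page_size,
                    'linked_partitioning': '1',
                })
            response = fetch_json(next_href)
            collection = response.get('collection')
            if not collection:
                break  # end page received
            for item in collection:
                yield item
            if 'next_href' in response:
                next_href = response['next_href']
                if not next_href:
                    break  # explicit end of the partitioned listing
            else:
                next_href = None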
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index e0b55078b..157bb74fe 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -22,6 +22,27 @@ class VidmeIE(InfoExtractor):
             'timestamp': 1406313244,
             'upload_date': '20140725',
             'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+        },
+    }, {
+        # tests uploader field
+        'url': 'https://vid.me/4Iib',
+        'info_dict': {
+            'id': '4Iib',
+            'ext': 'mp4',
+            'title': 'The Carver',
+            'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+            'duration': 97.859999999999999,
+            'timestamp': 1433203629,
+            'upload_date': '20150602',
+            'uploader': 'Thomas',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            'skip_download': True,
         },
     }, {
         # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
@@ -40,16 +61,23 @@
         title = self._og_search_title(webpage)
         description = self._og_search_description(webpage, default='')
         thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False))
-        width = int_or_none(self._og_search_property('video:width', webpage, fatal=False))
-        height = int_or_none(self._og_search_property('video:height', webpage, fatal=False))
+        timestamp = int_or_none(self._og_search_property(
+            'updated_time', webpage, fatal=False))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, fatal=False))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, fatal=False))
         duration = float_or_none(self._html_search_regex(
             r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
         view_count = str_to_int(self._html_search_regex(
-            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?',
+            webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
             r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
             webpage, 'like count', fatal=False))
+        uploader = self._html_search_regex(
+            'class="video_author_username"[^>]*>([^<]+)',
+            webpage, 'uploader', default=None)
 
         return {
             'id': video_id,
@@ -63,4 +91,5 @@
             'duration': duration,
             'view_count': view_count,
             'like_count': like_count,
+            'uploader': uploader,
         }
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index 6ef36290b..cda02ba24 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -5,11 +5,13 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
 )
 from ..utils import (
     determine_ext,
     int_or_none,
     parse_iso8601,
+    HEADRequest,
 )
 
 
@@ -62,7 +64,6 @@ class ViewsterIE(InfoExtractor):
     }]
 
     _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
-    _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA=='
 
     def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
         request = compat_urllib_request.Request(url)
@@ -72,6 +73,10 @@
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        # Get 'api_token' cookie
+        self._request_webpage(HEADRequest(url), video_id)
+        cookies = self._get_cookies(url)
+        self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
 
         info = self._download_json(
             'https://public-api.viewster.com/search/%s' % video_id,
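The Viewster change ties the new pieces together: instead of a hard-coded _AUTH_TOKEN, a cheap HEAD request primes the cookie jar with the api_token cookie the site sets, _get_cookies() reads it back, and unquote() undoes the percent-encoding (which, as noted earlier, SimpleCookie does not do). A rough standalone sketch of that flow, with the cookie name taken from the diff and the server behaviour assumed rather than verified:

    import urllib.request
    from http.cookiejar import CookieJar
    from http.cookies import SimpleCookie
    from urllib.parse import unquote

    class HEADRequest(urllib.request.Request):
        def get_method(self):
            return 'HEAD'

    def fetch_auth_token(page_url):
        jar = CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(jar))
        # 1. A HEAD request is enough for the server to set 'api_token'.
        opener.open(HEADRequest(page_url))
        # 2. Read back whatever the jar would send for this URL...
        probe = urllib.request.Request(page_url)
        jar.add_cookie_header(probe)
        cookies = SimpleCookie(probe.get_header('Cookie'))
        # 3. ...and undo the percent-encoding on the token value.
        return unquote(cookies['api_token'].value)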
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0e411bfb6..67a1df9a0 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -33,9 +33,11 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_duration,
+    smuggle_url,
     str_to_int,
     unescapeHTML,
     unified_strdate,
+    unsmuggle_url,
     uppercase_escape,
     ISO3166Utils,
 )
@@ -329,6 +331,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'upload_date': '20121002',
                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                 'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
                 'like_count': int,
                 'dislike_count': int,
                 'start_time': 1,
@@ -343,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'ext': 'mp4',
                 'upload_date': '20120506',
                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
-                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+                'description': 'md5:782e8651347686cba06e58f71ab51773',
+                'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+                         'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+                         'iconic ep', 'iconic', 'love', 'it'],
                 'uploader': 'Icona Pop',
                 'uploader_id': 'IconaPop',
             }
@@ -558,6 +564,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'format': '135',  # bestvideo
             }
         },
+        {
+            # Multifeed videos (multiple cameras), URL is for Main Camera
+            'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+            'info_dict': {
+                'id': 'jqWvoWXjCVs',
+                'title': 'teamPGP: Rocket League Noob Stream',
+                'description': 'md5:dc7872fb300e143831327f1bae3af010',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': 'jqWvoWXjCVs',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': '6h8e8xoXJzg',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'PUOgX5z9xZw',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'teuwxikvS5k',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (zim)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }],
+            'params': {
+                'skip_download': True,
+            },
+        }
     ]
 
     def __init__(self, *args, **kwargs):
@@ -889,6 +948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return formats
 
     def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
         proto = (
             'http' if self._downloader.params.get('prefer_insecure', False)
             else 'https')
@@ -1005,6 +1066,55 @@
                     '"token" parameter not in video info for unknown reason',
                     video_id=video_id)
 
+        # title
+        if 'title' in video_info:
+            video_title = video_info['title'][0]
+        else:
+            self._downloader.report_warning('Unable to extract video title')
+            video_title = '_'
+
+        # description
+        video_description = get_element_by_id("eow-description", video_webpage)
+        if video_description:
+            video_description = re.sub(r'''(?x)
+                <a\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    title="([^"]+)"\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    class="yt-uix-redirect-link"\s*>
+                [^<]+
+                </a>
+            ''', r'\1', video_description)
+            video_description = clean_html(video_description)
+        else:
+            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+            if fd_mobj:
+                video_description = unescapeHTML(fd_mobj.group(1))
+            else:
+                video_description = ''
+
+        if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+            if not self._downloader.params.get('noplaylist'):
+                entries = []
+                feed_ids = []
+                multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+                for feed in multifeed_metadata_list.split(','):
+                    feed_data = compat_parse_qs(feed)
+                    entries.append({
+                        '_type': 'url_transparent',
+                        'ie_key': 'Youtube',
+                        'url': smuggle_url(
+                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+                            {'force_singlefeed': True}),
+                        'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+                    })
+                    feed_ids.append(feed_data['id'][0])
+                self.to_screen(
+                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+                    % (', '.join(feed_ids), video_id))
+                return self.playlist_result(entries, video_id, video_title, video_description)
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
         if 'view_count' in video_info:
             view_count = int(video_info['view_count'][0])
         else:
@@ -1030,13 +1140,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             self._downloader.report_warning('unable to extract uploader nickname')
 
-        # title
-        if 'title' in video_info:
-            video_title = video_info['title'][0]
-        else:
-            self._downloader.report_warning('Unable to extract video title')
-            video_title = '_'
-
         # thumbnail image
         # We try first to get a high quality image:
         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -1072,25 +1175,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             video_categories = None
 
-        # description
-        video_description = get_element_by_id("eow-description", video_webpage)
-        if video_description:
-            video_description = re.sub(r'''(?x)
-                <a\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
-                    title="([^"]+)"\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
-                    class="yt-uix-redirect-link"\s*>
-                [^<]+
-                </a>
-            ''', r'\1', video_description)
-            video_description = clean_html(video_description)
-        else:
-            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
-            if fd_mobj:
-                video_description = unescapeHTML(fd_mobj.group(1))
-            else:
-                video_description = ''
+        video_tags = [
+            unescapeHTML(m.group('content'))
+            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
 
         def _extract_count(count_name):
             return str_to_int(self._search_regex(
@@ -1260,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'thumbnail': video_thumbnail,
             'description': video_description,
             'categories': video_categories,
+            'tags': video_tags,
             'subtitles': video_subtitles,
             'automatic_captions': automatic_captions,
             'duration': video_duration,
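smuggle_url and unsmuggle_url are existing youtube_dl.utils helpers: they append a JSON payload to the URL fragment so one extractor run can hand flags to the next. That is how each camera entry above carries force_singlefeed and avoids re-expanding into a playlist. A quick round-trip:

    from youtube_dl.utils import smuggle_url, unsmuggle_url

    url = smuggle_url(
        'https://www.youtube.com/watch?v=jqWvoWXjCVs',
        {'force_singlefeed': True})
    # The data rides along in a #__youtubedl_smuggle=... fragment.

    plain_url, data = unsmuggle_url(url, {})
    assert plain_url == 'https://www.youtube.com/watch?v=jqWvoWXjCVs'
    assert data == {'force_singlefeed': True}

    # A URL with nothing smuggled comes back unchanged, with the default:
    plain_url, data = unsmuggle_url(plain_url, {})
    assert data == {}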
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ae813099d..78dc2b449 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -139,21 +139,24 @@ def write_json_file(obj, fn):
 
 
 if sys.version_info >= (2, 7):
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         """ Find the xpath xpath[@key=val] """
         assert re.match(r'^[a-zA-Z-]+$', key)
-        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
-        expr = xpath + "[@%s='%s']" % (key, val)
+        if val:
+            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
         return node.find(expr)
 else:
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
         # .//node does not match if a node is a direct child of . !
         if isinstance(xpath, compat_str):
             xpath = xpath.encode('ascii')
 
         for f in node.findall(xpath):
-            if f.attrib.get(key) == val:
+            if key not in f.attrib:
+                continue
+            if val is None or f.attrib.get(key) == val:
                 return f
         return None
@@ -576,11 +579,9 @@ class ContentTooShortError(Exception):
     download is too small for what the server announced first, indicating
     the connection was probably interrupted.
     """
-    # Both in bytes
-    downloaded = None
-    expected = None
 
     def __init__(self, downloaded, expected):
+        # Both in bytes
         self.downloaded = downloaded
         self.expected = expected
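With val now optional, find_xpath_attr can match on mere attribute presence, which is what the new tests in test_utils.py exercise (including the empty-string value, which still matches exactly rather than falling into the presence-only branch). A small usage sketch:

    import xml.etree.ElementTree
    from youtube_dl.utils import find_xpath_attr

    doc = xml.etree.ElementTree.fromstring(
        '<root><node/><node x="a"/><node x="b" y="d"/><node x=""/></root>')

    find_xpath_attr(doc, './/node', 'x')       # doc[1]: first node with an x attribute
    find_xpath_attr(doc, './/node', 'x', 'b')  # doc[2]: first node where x == 'b'
    find_xpath_attr(doc, './/node', 'x', '')   # doc[3]: empty value matched exactly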
