diff options
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | docs/supportedsites.md | 1 | ||||
| -rw-r--r-- | test/test_YoutubeDL.py | 11 | ||||
| -rw-r--r-- | test/test_subtitles.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/arte.py | 16 | ||||
| -rw-r--r-- | youtube_dl/extractor/cbsnews.py | 52 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 17 | ||||
| -rw-r--r-- | youtube_dl/extractor/kuwo.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/spankbang.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/srgssr.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/vidme.py | 71 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 5 | ||||
| -rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py | 4 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 29 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
16 files changed, 206 insertions, 47 deletions
| @@ -455,6 +455,8 @@ The `-o` option allows users to indicate a template for the output file names. T   - `format_id`: The sequence will be replaced by the format code specified by `--format`.   - `duration`: The sequence will be replaced by the length of the video in seconds. +Note that some of the aforementioned sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`. +  The current default template is `%(title)s-%(id)s.%(ext)s`.  In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 61be9990d..ee34adf26 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -91,6 +91,7 @@   - **Canvas**   - **CBS**   - **CBSNews**: CBS News + - **CBSNewsLiveVideo**: CBS News Live Videos   - **CBSSports**   - **CeskaTelevize**   - **channel9**: Channel 9 diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 73910eaec..88c63010e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -248,6 +248,17 @@ class TestFormatSelection(unittest.TestCase):          def format_info(f_id):              info = YoutubeIE._formats[f_id].copy() + +            # XXX: In real cases InfoExtractor._parse_mpd() fills up 'acodec' +            # and 'vcodec', while in tests such information is incomplete since +            # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593 +            # test_YoutubeDL.test_youtube_format_selection is broken without +            # this fix +            if 'acodec' in info and 'vcodec' not in info: +                info['vcodec'] = 'none' +            elif 'vcodec' in info and 'acodec' not in info: +                info['acodec'] = 'none' +              
info['format_id'] = f_id              info['url'] = 'url:' + f_id              return info diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 9a695c4e8..27e763edd 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -65,16 +65,16 @@ class TestYoutubeSubtitles(BaseTestSubtitles):          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) -        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') -        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') -        for lang in ['it', 'fr', 'de']: +        self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') +        self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') +        for lang in ['fr', 'de']:              self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) -    def test_youtube_subtitles_sbv_format(self): +    def test_youtube_subtitles_ttml_format(self):          self.DL.params['writesubtitles'] = True -        self.DL.params['subtitlesformat'] = 'sbv' +        self.DL.params['subtitlesformat'] = 'ttml'          subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') +        self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54')      def test_youtube_subtitles_vtt_format(self):          self.DL.params['writesubtitles'] = True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e61a88de7..2fbc7f812 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -90,7 +90,10 @@ from .canalplus import CanalplusIE  from .canalc2 import Canalc2IE  from .canvas import CanvasIE  from .cbs import CBSIE -from .cbsnews import CBSNewsIE +from .cbsnews import ( +    CBSNewsIE, +    CBSNewsLiveVideoIE, +)  from .cbssports import 
CBSSportsIE  from .ccc import CCCIE  from .ceskatelevize import CeskaTelevizeIE @@ -819,7 +822,11 @@ from .videomore import (  )  from .videopremium import VideoPremiumIE  from .videott import VideoTtIE -from .vidme import VidmeIE +from .vidme import ( +    VidmeIE, +    VidmeUserIE, +    VidmeUserLikesIE, +)  from .vidzi import VidziIE  from .vier import VierIE, VierVideosIE  from .viewster import ViewsterIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b9e07f0ef..6ed855a57 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -13,6 +13,7 @@ from ..utils import (      unified_strdate,      get_element_by_attribute,      int_or_none, +    NO_DEFAULT,      qualities,  ) @@ -93,9 +94,18 @@ class ArteTVPlus7IE(InfoExtractor):          json_url = self._html_search_regex(              patterns, webpage, 'json vp url', default=None)          if not json_url: -            iframe_url = self._html_search_regex( -                r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', -                webpage, 'iframe url', group='url') +            def find_iframe_url(webpage, default=NO_DEFAULT): +                return self._html_search_regex( +                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', +                    webpage, 'iframe url', group='url', default=default) + +            iframe_url = find_iframe_url(webpage, None) +            if not iframe_url: +                embed_url = self._html_search_regex( +                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url') +                player = self._download_json( +                    embed_url, video_id, 'Downloading player page') +                iframe_url = find_iframe_url(player['html'])              json_url = compat_parse_qs(                  compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]          return self._extract_from_json_url(json_url, video_id, lang) diff --git 
a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index cabf7e73b..8f864699f 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -1,15 +1,14 @@  # encoding: utf-8  from __future__ import unicode_literals -import re -import json - +from .common import InfoExtractor  from .theplatform import ThePlatformIE +from ..utils import parse_duration  class CBSNewsIE(ThePlatformIE):      IE_DESC = 'CBS News' -    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)' +    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'      _TESTS = [          { @@ -48,14 +47,13 @@ class CBSNewsIE(ThePlatformIE):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        video_info = json.loads(self._html_search_regex( +        video_info = self._parse_json(self._html_search_regex(              r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', -            webpage, 'video JSON info')) +            webpage, 'video JSON info'), video_id)          item = video_info['item'] if 'item' in video_info else video_info          title = item.get('articleTitle') or item.get('hed') @@ -88,3 +86,41 @@ class CBSNewsIE(ThePlatformIE):              'formats': formats,              'subtitles': subtitles,          } + + +class CBSNewsLiveVideoIE(InfoExtractor): +    IE_DESC = 'CBS News Live Videos' +    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' + +    _TEST = { +        'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', +        'info_dict': { +            'id': 'clinton-sanders-prepare-to-face-off-in-nh', +            'ext': 'flv', +            'title': 'Clinton, Sanders 
Prepare To Face Off In NH', +            'duration': 334, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_info = self._parse_json(self._html_search_regex( +            r'data-story-obj=\'({.+?})\'', webpage, 'video JSON info'), video_id)['story'] + +        hdcore_sign = 'hdcore=3.3.1' +        f4m_formats = self._extract_f4m_formats(video_info['url'] + '&' + hdcore_sign, video_id) +        if f4m_formats: +            for entry in f4m_formats: +                # URLs without the extra param induce an 404 error +                entry.update({'extra_param_to_segment_url': hdcore_sign}) + +        return { +            'id': video_id, +            'title': video_info['headline'], +            'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), +            'duration': parse_duration(video_info.get('segmentDur')), +            'formats': f4m_formats, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b18e734c4..c02fe201c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1229,19 +1229,24 @@ class GenericIE(InfoExtractor):          # Check for direct link to a video          content_type = head_response.headers.get('Content-Type', '') -        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) +        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)          if m:              upload_date = unified_strdate(                  head_response.headers.get('Last-Modified')) +            formats = [] +            if m.group('format_id').endswith('mpegurl'): +                formats = self._extract_m3u8_formats(url, video_id, 'mp4') +            else: +                formats = [{ +                    'format_id': 
m.group('format_id'), +                    'url': url, +                    'vcodec': 'none' if m.group('type') == 'audio' else None +                }]              return {                  'id': video_id,                  'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),                  'direct': True, -                'formats': [{ -                    'format_id': m.group('format_id'), -                    'url': url, -                    'vcodec': 'none' if m.group('type') == 'audio' else None -                }], +                'formats': formats,                  'upload_date': upload_date,              } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0c8ed5d07..f641edef8 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -31,6 +31,10 @@ class KuwoBaseIE(InfoExtractor):                  (file_format['ext'], file_format.get('br', ''), song_id),                  song_id, note='Download %s url info' % file_format['format'],              ) + +            if song_url == 'IPDeny': +                raise ExtractorError('This song is blocked in this region', expected=True) +              if song_url.startswith('http://') or song_url.startswith('https://'):                  formats.append({                      'url': song_url, diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 3cfa671ed..50433d0f6 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -7,7 +7,7 @@ from .common import InfoExtractor  class SpankBangIE(InfoExtractor):      _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' -    _TEST = { +    _TESTS = [{          'url': 'http://spankbang.com/3vvn/video/fantasy+solo',          'md5': '1cc433e1d6aa14bc376535b8679302f7',          'info_dict': { @@ -19,7 +19,11 @@ class SpankBangIE(InfoExtractor):              'uploader': 'silly2587',              
'age_limit': 18,          } -    } +    }, { +        # 480p only +        'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -34,7 +38,8 @@ class SpankBangIE(InfoExtractor):              'ext': 'mp4',              'format_id': '%sp' % height,              'height': int(height), -        } for height in re.findall(r'<(?:span|li)[^>]+q_(\d+)p', webpage)] +        } for height in re.findall(r'<(?:span|li|p)[^>]+[qb]_(\d+)p', webpage)] +        self._check_formats(formats, video_id)          self._sort_formats(formats)          title = self._html_search_regex( diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 4707029ca..246970c4d 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -70,14 +70,11 @@ class SRGSSRIE(InfoExtractor):                          asset_url, media_id, 'mp4', 'm3u8_native',                          m3u8_id=format_id, fatal=False))                  else: -                    ext = None -                    if protocol == 'RTMP': -                        ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext')                      formats.append({                          'format_id': format_id,                          'url': asset_url,                          'preference': preference(quality), -                        'ext': ext, +                        'ext': 'flv' if protocol == 'RTMP' else None,                      })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 3d63ed4f0..b1156d531 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,5 +1,7 @@  from __future__ import unicode_literals +import itertools +  from .common import InfoExtractor  from ..compat import compat_HTTPError  from ..utils import ( @@ -11,7 +13,8 @@ from ..utils import (  
class VidmeIE(InfoExtractor): -    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' +    IE_NAME = 'vidme' +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)'      _TESTS = [{          'url': 'https://vid.me/QNB',          'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', @@ -202,3 +205,69 @@ class VidmeIE(InfoExtractor):              'comment_count': comment_count,              'formats': formats,          } + + +class VidmeListBaseIE(InfoExtractor): +    # Max possible limit according to https://docs.vid.me/#api-Videos-List +    _LIMIT = 100 + +    def _entries(self, user_id, user_name): +        for page_num in itertools.count(1): +            page = self._download_json( +                'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d' +                % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT), +                user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num)) + +            videos = page.get('videos', []) +            if not videos: +                break + +            for video in videos: +                video_url = video.get('full_url') or video.get('embed_url') +                if video_url: +                    yield self.url_result(video_url, VidmeIE.ie_key()) + +            total = int_or_none(page.get('page', {}).get('total')) +            if total and self._LIMIT * page_num >= total: +                break + +    def _real_extract(self, url): +        user_name = self._match_id(url) + +        user_id = self._download_json( +            'https://api.vid.me/userByUsername?username=%s' % user_name, +            user_name)['user']['user_id'] + +        return self.playlist_result( +            self._entries(user_id, user_name), user_id, +            '%s - %s' % (user_name, self._TITLE)) + + +class VidmeUserIE(VidmeListBaseIE): +    IE_NAME = 'vidme:user' +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' +    
_API_ITEM = 'list' +    _TITLE = 'Videos' +    _TEST = { +        'url': 'https://vid.me/EFARCHIVE', +        'info_dict': { +            'id': '3834632', +            'title': 'EFARCHIVE - %s' % _TITLE, +        }, +        'playlist_mincount': 238, +    } + + +class VidmeUserLikesIE(VidmeListBaseIE): +    IE_NAME = 'vidme:user:likes' +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' +    _API_ITEM = 'likes' +    _TITLE = 'Likes' +    _TEST = { +        'url': 'https://vid.me/ErinAlexis/likes', +        'info_dict': { +            'id': '6483530', +            'title': 'ErinAlexis - %s' % _TITLE, +        }, +        'playlist_mincount': 415, +    } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 828f5d1f4..63abe5477 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -369,6 +369,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # RTMP (unnamed)          '_rtmp': {'protocol': 'rtmp'},      } +    _SUBTITLE_FORMATS = ('ttml', 'vtt')      IE_NAME = 'youtube'      _TESTS = [ @@ -918,7 +919,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              if lang in sub_lang_list:                  continue              sub_formats = [] -            for ext in ['sbv', 'vtt', 'srt']: +            for ext in self._SUBTITLE_FORMATS:                  params = compat_urllib_parse.urlencode({                      'lang': lang,                      'v': video_id, @@ -988,7 +989,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              for lang_node in caption_list.findall('target'):                  sub_lang = lang_node.attrib['lang_code']                  sub_formats = [] -                for ext in ['sbv', 'vtt', 'srt']: +                for ext in self._SUBTITLE_FORMATS:                      params = compat_urllib_parse.urlencode({                          'lang': original_lang,                          'tlang': sub_lang, diff --git a/youtube_dl/postprocessor/ffmpeg.py 
b/youtube_dl/postprocessor/ffmpeg.py index 16a64802a..22d7ac65a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -391,6 +391,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor):          for (name, value) in metadata.items():              options.extend(['-metadata', '%s=%s' % (name, value)]) +        # https://github.com/rg3/youtube-dl/issues/8350 +        if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False): +            options.extend(['-bsf:a', 'aac_adtstoasc']) +          self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)          self.run_ffmpeg(filename, temp_filename, options)          os.remove(encodeFilename(filename)) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c63b61598..4262ad6ac 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):          'ttaf1': 'http://www.w3.org/2006/10/ttaf1',      }) -    def parse_node(node): -        str_or_empty = functools.partial(str_or_none, default='') +    class TTMLPElementParser(object): +        out = '' -        out = str_or_empty(node.text) +        def start(self, tag, attrib): +            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): +                self.out += '\n' -        for child in node: -            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): -                out += '\n' + str_or_empty(child.tail) -            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): -                out += str_or_empty(parse_node(child)) -            else: -                out += str_or_empty(xml.etree.ElementTree.tostring(child)) +        def end(self, tag): +            pass -        return out +        def data(self, data): +            self.out += data + +        def close(self): +            return self.out.strip() + +    def parse_node(node): +        target = 
TTMLPElementParser() +        parser = xml.etree.ElementTree.XMLParser(target=target) +        parser.feed(xml.etree.ElementTree.tostring(node)) +        return parser.close()      dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))      out = [] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6da42c5a5..3fec14ab1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2016.02.01' +__version__ = '2016.02.05.1' | 
