diff options
47 files changed, 878 insertions, 280 deletions
| @@ -173,6 +173,10 @@ which means you can modify it, redistribute it or use it however you like.                                       expected filesize (experimental)      --hls-prefer-native              Use the native HLS downloader instead of                                       ffmpeg (experimental) +    --hls-use-mpegts                 Use the mpegts container for HLS videos, +                                     allowing to play the video while +                                     downloading (some players may not be able +                                     to play it)      --external-downloader COMMAND    Use the specified external downloader.                                       Currently supports                                       aria2c,axel,curl,httpie,wget diff --git a/docs/supportedsites.md b/docs/supportedsites.md index eb68c23b5..61be9990d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -55,6 +55,7 @@   - **audiomack**   - **audiomack:album**   - **Azubu** + - **AzubuLive**   - **BaiduVideo**: 百度视频   - **bambuser**   - **bambuser:channel** @@ -133,6 +134,8 @@   - **DailymotionCloud**   - **daum.net**   - **daum.net:clip** + - **daum.net:playlist** + - **daum.net:user**   - **DBTV**   - **DCN**   - **dcn:live** @@ -315,6 +318,7 @@   - **mailru**: Видео@Mail.Ru   - **MakerTV**   - **Malemotion** + - **MatchTV**   - **MDR**: MDR.DE and KiKA   - **media.ccc.de**   - **metacafe** @@ -507,6 +511,7 @@   - **Sapo**: SAPO Vídeos   - **savefrom.net**   - **SBS**: sbs.com.au + - **schooltv**   - **SciVee**   - **screen.yahoo:search**: Yahoo screen search   - **Screencast** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b53cfbe78..73910eaec 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -14,6 +14,7 @@ from test.helper import FakeYDL, assertRegexpMatches  from youtube_dl import YoutubeDL  from youtube_dl.compat import compat_str, compat_urllib_error  from youtube_dl.extractor import YoutubeIE +from youtube_dl.extractor.common import InfoExtractor  from youtube_dl.postprocessor.common import PostProcessor  from youtube_dl.utils import ExtractorError, match_filter_func @@ -646,6 +647,42 @@ class TestYoutubeDL(unittest.TestCase):          ydl = YDL()          self.assertRaises(compat_urllib_error.URLError, ydl.urlopen, 'file:///etc/passwd') +    def test_do_not_override_ie_key_in_url_transparent(self): +        ydl = YDL() + +        class Foo1IE(InfoExtractor): +            _VALID_URL = r'foo1:' + +            def _real_extract(self, url): +                return { +                    '_type': 'url_transparent', +                    'url': 'foo2:', +                    'ie_key': 'Foo2', +                } + +        class Foo2IE(InfoExtractor): +            _VALID_URL = r'foo2:' + +            def _real_extract(self, url): +                return { +                    '_type': 'url', +                    'url': 'foo3:', +                    'ie_key': 'Foo3', +                } + +        class Foo3IE(InfoExtractor): +            _VALID_URL = r'foo3:' + +            def _real_extract(self, url): +                return _make_result([{'url': TEST_URL}]) + +        ydl.add_info_extractor(Foo1IE(ydl)) +        ydl.add_info_extractor(Foo2IE(ydl)) +        ydl.add_info_extractor(Foo3IE(ydl)) +        ydl.extract_info('foo1:') +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(downloaded['url'], TEST_URL) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a0c11e6c1..f5af184e6 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -56,7 +56,7 @@ class TestAllURLsMatching(unittest.TestCase):          assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')      def test_youtube_user_matching(self): -        self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) +        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])      def test_youtube_feeds(self):          self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 9ed9fe622..9a695c4e8 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -21,7 +21,7 @@ from youtube_dl.extractor import (      NPOIE,      ComedyCentralIE,      NRKTVIE, -    RaiIE, +    RaiTVIE,      VikiIE,      ThePlatformIE,      ThePlatformFeedIE, @@ -260,7 +260,7 @@ class TestNRKSubtitles(BaseTestSubtitles):  class TestRaiSubtitles(BaseTestSubtitles):      url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' -    IE = RaiIE +    IE = RaiTVIE      def test_allsubtitles(self):          self.DL.params['writesubtitles'] = True diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 26aadb34f..47df0f348 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -34,7 +34,7 @@ class TestYoutubeLists(unittest.TestCase):          ie = YoutubePlaylistIE(dl)          # TODO find a > 100 (paginating?) videos course          result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') -        entries = result['entries'] +        entries = list(result['entries'])          self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs')          self.assertEqual(len(entries), 25)          self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e1bd40843..50228bb32 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -263,7 +263,7 @@ class YoutubeDL(object):      the downloader (see youtube_dl/downloader/common.py):      nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,      noresizebuffer, retries, continuedl, noprogress, consoletitle, -    xattr_set_filesize, external_downloader_args. +    xattr_set_filesize, external_downloader_args, hls_use_mpegts.      The following options are used by the post processors:      prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available, @@ -707,7 +707,6 @@ class YoutubeDL(object):          It will also download the videos if 'download'.          Returns the resolved ie_result.          """ -          result_type = ie_result.get('_type', 'video')          if result_type in ('url', 'url_transparent'): @@ -736,7 +735,7 @@ class YoutubeDL(object):              force_properties = dict(                  (k, v) for k, v in ie_result.items() if v is not None) -            for f in ('_type', 'url'): +            for f in ('_type', 'url', 'ie_key'):                  if f in force_properties:                      del force_properties[f]              new_result = info.copy() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9f131f5db..f5f064241 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -369,6 +369,7 @@ def _real_main(argv=None):          'no_color': opts.no_color,          'ffmpeg_location': opts.ffmpeg_location,          'hls_prefer_native': opts.hls_prefer_native, +        'hls_use_mpegts': opts.hls_use_mpegts,          'external_downloader_args': external_downloader_args,          'postprocessor_args': postprocessor_args,          'cn_verification_proxy': opts.cn_verification_proxy, diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index fc7521598..de815612c 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,6 +45,7 @@ class FileDownloader(object):                          (experimental)      external_downloader_args:  A list of additional command-line arguments for the                          external downloader. +    hls_use_mpegts:     Use the mpegts container for HLS videos.      Subclasses of this one must re-define the real_download method.      """ diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index aaf0c49c8..fc9642905 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -273,15 +273,21 @@ class F4mFD(FragmentFD):          return fragments_list      def _parse_bootstrap_node(self, node, base_url): -        if node.text is None: +        # Sometimes non empty inline bootstrap info can be specified along +        # with bootstrap url attribute (e.g. dummy inline bootstrap info +        # contains whitespace characters in [1]). We will prefer bootstrap +        # url over inline bootstrap info when present. +        # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m +        bootstrap_url = node.get('url') +        if bootstrap_url:              bootstrap_url = compat_urlparse.urljoin( -                base_url, node.attrib['url']) +                base_url, bootstrap_url)              boot_info = self._get_bootstrap_from_url(bootstrap_url)          else:              bootstrap_url = None              bootstrap = base64.b64decode(node.text.encode('ascii'))              boot_info = read_bootstrap_info(bootstrap) -        return (boot_info, bootstrap_url) +        return boot_info, bootstrap_url      def real_download(self, filename, info_dict):          man_url = info_dict['url'] @@ -316,7 +322,8 @@ class F4mFD(FragmentFD):              metadata = None          fragments_list = build_fragments_list(boot_info) -        if self.params.get('test', False): +        test = self.params.get('test', False) +        if test:              # We only download the first fragment              fragments_list = fragments_list[:1]          total_frags = len(fragments_list) @@ -326,6 +333,7 @@ class F4mFD(FragmentFD):          ctx = {              'filename': filename,              'total_frags': total_frags, +            'live': live,          }          self._prepare_frag_download(ctx) @@ -380,7 +388,7 @@ class F4mFD(FragmentFD):                  else:                      raise -            if not fragments_list and live and bootstrap_url: +            if not fragments_list and not test and live and bootstrap_url:                  fragments_list = self._update_live_fragments(bootstrap_url, frag_i)                  total_frags += len(fragments_list)                  if fragments_list and (fragments_list[0][1] > frag_i + 1): diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 0c9113d0f..8b96eceb9 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -26,7 +26,11 @@ class FragmentFD(FileDownloader):          self._start_frag_download(ctx)      def _prepare_frag_download(self, ctx): -        self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) +        if 'live' not in ctx: +            ctx['live'] = False +        self.to_screen( +            '[%s] Total fragments: %s' +            % (self.FD_NAME, ctx['total_frags'] if not ctx['live'] else 'unknown (live)'))          self.report_destination(ctx['filename'])          dl = HttpQuietDownloader(              self.ydl, @@ -74,14 +78,14 @@ class FragmentFD(FileDownloader):              if s['status'] not in ('downloading', 'finished'):                  return -            frag_total_bytes = s.get('total_bytes') or 0 - -            estimated_size = ( -                (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / -                (state['frag_index'] + 1) * total_frags)              time_now = time.time() -            state['total_bytes_estimate'] = estimated_size              state['elapsed'] = time_now - start +            frag_total_bytes = s.get('total_bytes') or 0 +            if not ctx['live']: +                estimated_size = ( +                    (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / +                    (state['frag_index'] + 1) * total_frags) +                state['total_bytes_estimate'] = estimated_size              if s['status'] == 'finished':                  state['frag_index'] += 1 @@ -91,9 +95,10 @@ class FragmentFD(FileDownloader):              else:                  frag_downloaded_bytes = s['downloaded_bytes']                  state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] -                state['eta'] = self.calc_eta( -                    start, time_now, estimated_size, -                    state['downloaded_bytes']) +                if not ctx['live']: +                    state['eta'] = self.calc_eta( +                        start, time_now, estimated_size, +                        state['downloaded_bytes'])                  state['speed'] = s.get('speed')                  ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes              self._hook_progress(state) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 10b83c6b2..cb34dc4ab 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -39,7 +39,11 @@ class HlsFD(FileDownloader):                  '-headers',                  ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] -        args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] +        args += ['-i', url, '-c', 'copy'] +        if self.params.get('hls_use_mpegts', False): +            args += ['-f', 'mpegts'] +        else: +            args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc']          args = [encodeArgument(opt) for opt in args]          args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 532be7e4c..e61a88de7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -50,7 +50,7 @@ from .atresplayer import AtresPlayerIE  from .atttechchannel import ATTTechChannelIE  from .audimedia import AudiMediaIE  from .audiomack import AudiomackIE, AudiomackAlbumIE -from .azubu import AzubuIE +from .azubu import AzubuIE, AzubuLiveIE  from .baidu import BaiduVideoIE  from .bambuser import BambuserIE, BambuserChannelIE  from .bandcamp import BandcampIE, BandcampAlbumIE @@ -142,6 +142,8 @@ from .dailymotion import (  from .daum import (      DaumIE,      DaumClipIE, +    DaumPlaylistIE, +    DaumUserIE,  )  from .dbtv import DBTVIE  from .dcn import ( @@ -372,6 +374,7 @@ from .macgamestore import MacGameStoreIE  from .mailru import MailRuIE  from .makertv import MakerTVIE  from .malemotion import MalemotionIE +from .matchtv import MatchTVIE  from .mdr import MDRIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE @@ -482,6 +485,7 @@ from .npo import (      NPOLiveIE,      NPORadioIE,      NPORadioFragmentIE, +    SchoolTVIE,      VPROIE,      WNLIE  ) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index be7913bc7..92eee8119 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,11 +8,7 @@ from ..compat import compat_str  from ..utils import int_or_none -class ACastBaseIE(InfoExtractor): -    _API_BASE_URL = 'https://www.acast.com/api/' - - -class ACastIE(ACastBaseIE): +class ACastIE(InfoExtractor):      IE_NAME = 'acast'      _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<channel>[^/]+)/(?P<id>[^/#?]+)'      _TEST = { @@ -23,14 +19,19 @@ class ACastIE(ACastBaseIE):              'ext': 'mp3',              'title': '"Where Are You?": Taipei 101, Taiwan',              'timestamp': 1196172000000, -            'description': 'md5:0c5d8201dfea2b93218ea986c91eee6e', +            'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e',              'duration': 211,          }      }      def _real_extract(self, url):          channel, display_id = re.match(self._VALID_URL, url).groups() -        cast_data = self._download_json(self._API_BASE_URL + 'channels/%s/acasts/%s/playback' % (channel, display_id), display_id) + +        embed_page = self._download_webpage( +            re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id) +        cast_data = self._parse_json(self._search_regex( +            r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'), +            display_id)['GetAcast/%s/%s' % (channel, display_id)]          return {              'id': compat_str(cast_data['id']), @@ -44,7 +45,7 @@ class ACastIE(ACastBaseIE):          } -class ACastChannelIE(ACastBaseIE): +class ACastChannelIE(InfoExtractor):      IE_NAME = 'acast:channel'      _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<id>[^/#?]+)'      _TEST = { @@ -56,6 +57,7 @@ class ACastChannelIE(ACastBaseIE):          },          'playlist_mincount': 20,      } +    _API_BASE_URL = 'https://www.acast.com/api/'      @classmethod      def suitable(cls, url): diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 7d65b8193..190bc2cc8 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -8,6 +8,8 @@ from .common import InfoExtractor  from ..compat import compat_str  from ..utils import (      qualities, +    unescapeHTML, +    xpath_element,  ) @@ -31,7 +33,7 @@ class AllocineIE(InfoExtractor):              'id': '19540403',              'ext': 'mp4',              'title': 'Planes 2 Bande-annonce VF', -            'description': 'md5:eeaffe7c2d634525e21159b93acf3b1e', +            'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -41,7 +43,7 @@ class AllocineIE(InfoExtractor):              'id': '19544709',              'ext': 'mp4',              'title': 'Dragons 2 - Bande annonce finale VF', -            'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac', +            'description': 'md5:601d15393ac40f249648ef000720e7e3',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -59,14 +61,18 @@ class AllocineIE(InfoExtractor):          if typ == 'film':              video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id')          else: -            player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') - -            player_data = json.loads(player) -            video_id = compat_str(player_data['refMedia']) +            player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player', default=None) +            if player: +                player_data = json.loads(player) +                video_id = compat_str(player_data['refMedia']) +            else: +                model = self._search_regex(r'data-model="([^"]+)">', webpage, 'data model') +                model_data = self._parse_json(unescapeHTML(model), display_id) +                video_id = compat_str(model_data['id'])          xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) -        video = xml.find('.//AcVisionVideo').attrib +        video = xpath_element(xml, './/AcVisionVideo').attrib          quality = qualities(['ld', 'md', 'hd'])          formats = [] diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 0961d339f..011edf128 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals  import json  from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( +    ExtractorError, +    float_or_none, +    sanitized_Request, +)  class AzubuIE(InfoExtractor): @@ -91,3 +95,37 @@ class AzubuIE(InfoExtractor):              'view_count': view_count,              'formats': formats,          } + + +class AzubuLiveIE(InfoExtractor): +    _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + +    _TEST = { +        'url': 'http://www.azubu.tv/MarsTVMDLen', +        'only_matching': True, +    } + +    def _real_extract(self, url): +        user = self._match_id(url) + +        info = self._download_json( +            'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user), +            user)['data'] +        if info['type'] != 'STREAM': +            raise ExtractorError('{0} is not streaming live'.format(user), expected=True) + +        req = sanitized_Request( +            'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id']) +        req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV') +        bc_info = self._download_json(req, user) +        m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') +        formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') + +        return { +            'id': info['id'], +            'title': self._live_title(info['title']), +            'uploader_id': user, +            'formats': formats, +            'is_live': True, +            'thumbnail': bc_info['poster'], +        } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 210ce568b..6ddee686c 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -194,6 +194,19 @@ class BBCCoUkIE(InfoExtractor):                  'skip_download': True,              },          }, { +            # compact player (https://github.com/rg3/youtube-dl/issues/8147) +            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', +            'info_dict': { +                'id': 'p028bfkj', +                'ext': 'flv', +                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', +                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', +            }, +            'params': { +                # rtmp download +                'skip_download': True, +            }, +        }, {              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',              'only_matching': True,          }, { @@ -485,7 +498,8 @@ class BBCCoUkIE(InfoExtractor):                  (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',                   r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')              description = self._search_regex( -                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', +                (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', +                 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),                  webpage, 'description', default=None)              if not description:                  description = self._html_search_meta('description', webpage) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 510813f76..c28e72927 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -1,7 +1,13 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..utils import ( +    js_to_json, +    determine_ext, +)  class BpbIE(InfoExtractor): @@ -10,7 +16,8 @@ class BpbIE(InfoExtractor):      _TEST = {          'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', -        'md5': '0792086e8e2bfbac9cdf27835d5f2093', +        # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 +        'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',          'info_dict': {              'id': '297',              'ext': 'mp4', @@ -25,13 +32,26 @@ class BpbIE(InfoExtractor):          title = self._html_search_regex(              r'<h2 class="white">(.*?)</h2>', webpage, 'title') -        video_url = self._html_search_regex( -            r'(http://film\.bpb\.de/player/dokument_[0-9]+\.mp4)', -            webpage, 'video URL') +        video_info_dicts = re.findall( +            r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + +        formats = [] +        for video_info in video_info_dicts: +            video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) +            quality = video_info['quality'] +            video_url = video_info['src'] +            formats.append({ +                'url': video_url, +                'preference': 10 if quality == 'high' else 0, +                'format_note': quality, +                'format_id': '%s-%s' % (quality, determine_ext(video_url)), +            }) + +        self._sort_formats(formats)          return {              'id': video_id, -            'url': video_url, +            'formats': formats,              'title': title,              'description': self._og_search_description(webpage),          } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 33290fd74..199a04d1c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -828,7 +828,7 @@ class InfoExtractor(object):          for f in formats:              # Automatically determine tbr when missing based on abr and vbr (improves              # formats sorting in some cases) -            if 'tbr' not in f and 'abr' in f and 'vbr' in f: +            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:                  f['tbr'] = f['abr'] + f['vbr']          def _formats_key(f): @@ -1330,6 +1330,83 @@ class InfoExtractor(object):              })          return entries +    def _download_dash_manifest(self, dash_manifest_url, video_id, fatal=True): +        return self._download_xml( +            dash_manifest_url, video_id, +            note='Downloading DASH manifest', +            errnote='Could not download DASH manifest', +            fatal=fatal) + +    def _extract_dash_manifest_formats(self, dash_manifest_url, video_id, fatal=True, namespace=None, formats_dict={}): +        dash_doc = self._download_dash_manifest(dash_manifest_url, video_id, fatal) +        if dash_doc is False: +            return [] + +        return self._parse_dash_manifest( +            dash_doc, namespace=namespace, formats_dict=formats_dict) + +    def _parse_dash_manifest(self, dash_doc, namespace=None, formats_dict={}): +        def _add_ns(path): +            return self._xpath_ns(path, namespace) + +        formats = [] +        for a in dash_doc.findall('.//' + _add_ns('AdaptationSet')): +            mime_type = a.attrib.get('mimeType') +            for r in a.findall(_add_ns('Representation')): +                mime_type = r.attrib.get('mimeType') or mime_type +                url_el = r.find(_add_ns('BaseURL')) +                if mime_type == 'text/vtt': +                    # TODO implement WebVTT downloading +                    pass +                elif mime_type.startswith('audio/') or mime_type.startswith('video/'): +                    segment_list = r.find(_add_ns('SegmentList')) +                    format_id = r.attrib['id'] +                    video_url = url_el.text if url_el is not None else None +                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) +                    f = { +                        'format_id': format_id, +                        'url': video_url, +                        'width': int_or_none(r.attrib.get('width')), +                        'height': int_or_none(r.attrib.get('height')), +                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), +                        'asr': int_or_none(r.attrib.get('audioSamplingRate')), +                        'filesize': filesize, +                        'fps': int_or_none(r.attrib.get('frameRate')), +                    } +                    if segment_list is not None: +                        initialization_url = segment_list.find(_add_ns('Initialization')).attrib['sourceURL'] +                        f.update({ +                            'initialization_url': initialization_url, +                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall(_add_ns('SegmentURL'))], +                            'protocol': 'http_dash_segments', +                        }) +                        if not f.get('url'): +                            f['url'] = initialization_url +                    try: +                        existing_format = next( +                            fo for fo in formats +                            if fo['format_id'] == format_id) +                    except StopIteration: +                        full_info = formats_dict.get(format_id, {}).copy() +                        full_info.update(f) +                        codecs = r.attrib.get('codecs') +                        if codecs: +                            if mime_type.startswith('video/'): +                                vcodec, acodec = codecs, 'none' +                            else:  # mime_type.startswith('audio/') +                                vcodec, acodec = 'none', codecs + +                            full_info.update({ +                                'vcodec': vcodec, +                                'acodec': acodec, +                            }) +                        formats.append(full_info) +                    else: +                        existing_format.update(f) +                else: +                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) +        return formats +      def _live_title(self, name):          """ Generate the title for a live video """          now = datetime.datetime.now() diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b78edf729..b8b9d058d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -113,7 +113,7 @@ class CSpanIE(InfoExtractor):                      'tbr': int_or_none(get_text_attr(quality, 'bitrate')),                  })              if not formats: -                path = get_text_attr(f, 'path') +                path = unescapeHTML(get_text_attr(f, 'path'))                  if not path:                      continue                  formats = self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index f08f57157..c84c51058 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -2,17 +2,26 @@  from __future__ import unicode_literals +import re +import itertools +  from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse, +    compat_urllib_parse_unquote, +    compat_urlparse, +)  from ..utils import (      int_or_none,      str_to_int,      xpath_text, +    unescapeHTML,  )  class DaumIE(InfoExtractor): -    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/v/(?P<id>[^?#&]+)' +    _VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P<id>[^?#&]+)'      IE_NAME = 'daum.net'      _TESTS = [{ @@ -23,25 +32,57 @@ class DaumIE(InfoExtractor):              'title': '마크 헌트 vs 안토니오 실바',              'description': 'Mark Hunt vs Antonio Silva',              'upload_date': '20131217', +            'thumbnail': 're:^https?://.*\.(?:jpg|png)',              'duration': 2117,              'view_count': int,              'comment_count': int,          },      }, { +        'url': 'http://m.tvpot.daum.net/v/65139429', +        'info_dict': { +            'id': '65139429', +            'ext': 'mp4', +            'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', +            'description': 'md5:79794514261164ff27e36a21ad229fc5', +            'upload_date': '20150604', +            'thumbnail': 're:^https?://.*\.(?:jpg|png)', +            'duration': 154, +            'view_count': int, +            'comment_count': int, +        }, +    }, {          'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24',          'only_matching': True, +    }, { +        'url': 'http://videofarm.daum.net/controller/player/VodPlayer.swf?vid=vwIpVpCQsT8%24&ref=', +        'info_dict': { +            'id': 'vwIpVpCQsT8$', +            'ext': 'flv', +            'title': '01-Korean War ( Trouble on the horizon )', +            'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름', +            'upload_date': '20080223', +            'thumbnail': 're:^https?://.*\.(?:jpg|png)', +            'duration': 249, +            'view_count': int, +            'comment_count': int, +        },      }]      def _real_extract(self, url): -        video_id = self._match_id(url) +        video_id = compat_urllib_parse_unquote(self._match_id(url))          query = compat_urllib_parse.urlencode({'vid': video_id}) -        info = self._download_xml( -            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, -            'Downloading video info')          movie_data = self._download_json(              'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,              video_id, 'Downloading video formats info') +        # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid +        if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): +            return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) + +        info = self._download_xml( +            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, +            'Downloading video info') +          formats = []          for format_el in movie_data['output_list']['output_list']:              profile = format_el['profile'] @@ -76,8 +117,9 @@ class DaumIE(InfoExtractor):  class DaumClipIE(InfoExtractor): -    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.do|mypot/View.do)\?.*?clipid=(?P<id>\d+)' +    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P<id>\d+)'      IE_NAME = 'daum.net:clip' +    _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s'      _TESTS = [{          'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', @@ -87,11 +129,19 @@ class DaumClipIE(InfoExtractor):              'title': 'DOTA 2GETHER 시즌2 6회 - 2부',              'description': 'DOTA 2GETHER 시즌2 6회 - 2부',              'upload_date': '20130831', +            'thumbnail': 're:^https?://.*\.(?:jpg|png)',              'duration': 3868,              'view_count': int,          }, +    }, { +        'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', +        'only_matching': True,      }] +    @classmethod +    def suitable(cls, url): +        return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url) +      def _real_extract(self, url):          video_id = self._match_id(url)          clip_info = self._download_json( @@ -102,7 +152,7 @@ class DaumClipIE(InfoExtractor):              '_type': 'url_transparent',              'id': video_id,              'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'], -            'title': clip_info['title'], +            'title': unescapeHTML(clip_info['title']),              'thumbnail': clip_info.get('thumb_url'),              'description': clip_info.get('contents'),              'duration': int_or_none(clip_info.get('duration')), @@ -110,3 +160,139 @@ class DaumClipIE(InfoExtractor):              'view_count': int_or_none(clip_info.get('play_count')),              'ie_key': 'Daum',          } + + +class DaumListIE(InfoExtractor): +    def _get_entries(self, list_id, list_id_type): +        name = None +        entries = [] +        for pagenum in itertools.count(1): +            list_info = self._download_json( +                'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % ( +                    pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum) + +            entries.extend([ +                self.url_result( +                    'http://tvpot.daum.net/v/%s' % clip['vid']) +                for clip in list_info['clip_list'] +            ]) + +            if not name: +                name = list_info.get('playlist_bean', {}).get('name') or \ +                    list_info.get('potInfo', {}).get('name') + +            if not list_info.get('has_more'): +                break + +        return name, entries + +    def _check_clip(self, url, list_id): +        query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) +        if 'clipid' in query_dict: +            clip_id = query_dict['clipid'][0] +            if self._downloader.params.get('noplaylist'): +                self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) +                return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') +            else: +                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) + + +class DaumPlaylistIE(DaumListIE): +    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View\.do|Top\.tv)\?.*?playlistid=(?P<id>[0-9]+)' +    IE_NAME = 'daum.net:playlist' +    _URL_TEMPLATE = 'http://tvpot.daum.net/mypot/View.do?playlistid=%s' + +    _TESTS = [{ +        'note': 'Playlist url with clipid', +        'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', +        'info_dict': { +            'id': '6213966', +            'title': 'Woorissica Official', +        }, +        'playlist_mincount': 181 +    }, { +        'note': 'Playlist url with clipid - noplaylist', +        'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', +        'info_dict': { +            'id': '73806844', +            'ext': 'mp4', +            'title': '151017 Airport', +            'upload_date': '20160117', +        }, +        'params': { +            'noplaylist': True, +            'skip_download': True, +        } +    }] + +    @classmethod +    def suitable(cls, url): +        return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url) + +    def _real_extract(self, url): +        list_id = self._match_id(url) + +        clip_result = self._check_clip(url, list_id) +        if clip_result: +            return clip_result + +        name, entries = self._get_entries(list_id, 'playlistid') + +        return self.playlist_result(entries, list_id, name) + + +class DaumUserIE(DaumListIE): +    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View|Top)\.(?:do|tv)\?.*?ownerid=(?P<id>[0-9a-zA-Z]+)' +    IE_NAME = 'daum.net:user' + +    _TESTS = [{ +        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0', +        'info_dict': { +            'id': 'o2scDLIVbHc0', +            'title': '마이 리틀 텔레비전', +        }, +        'playlist_mincount': 213 +    }, { +        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156', +        'info_dict': { +            'id': '73801156', +            'ext': 'mp4', +            'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116', +            'upload_date': '20160117', +            'description': 'md5:5e91d2d6747f53575badd24bd62b9f36' +        }, +        'params': { +            'noplaylist': True, +            'skip_download': True, +        } +    }, { +        'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence', +        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631', +        'info_dict': { +            'id': '6196631', +            'title': '마이 리틀 텔레비전 - 20160109', +        }, +        'playlist_count': 11 +    }, { +        'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0', +        'only_matching': True, +    }, { +        'url': 'http://m.tvpot.daum.net/mypot/Top.tv?ownerid=45x1okb1If50&playlistid=3569733', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        list_id = self._match_id(url) + +        clip_result = self._check_clip(url, list_id) +        if clip_result: +            return clip_result + +        query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) +        if 'playlistid' in query_dict: +            playlist_id = query_dict['playlistid'][0] +            return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist') + +        name, entries = self._get_entries(list_id, 'ownerid') + +        return self.playlist_result(entries, list_id, name) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 3762d8748..db4b263bc 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -53,8 +53,8 @@ class ESPNIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          video_id = self._search_regex( -            r'class="video-play-button"[^>]+data-id="(\d+)', -            webpage, 'video id') +            r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)', +            webpage, 'video id', group='id')          cms = 'espn'          if 'data-source="intl"' in webpage: diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cb5dd57fb..b6d1180f0 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -6,9 +6,11 @@ import socket  from .common import InfoExtractor  from ..compat import ( +    compat_etree_fromstring,      compat_http_client,      compat_urllib_error,      compat_urllib_parse_unquote, +    compat_urllib_parse_unquote_plus,  )  from ..utils import (      error_to_compat_str, @@ -44,6 +46,9 @@ class FacebookIE(InfoExtractor):      _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'      _NETRC_MACHINE = 'facebook'      IE_NAME = 'facebook' + +    _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' +      _TESTS = [{          'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',          'md5': '6a40d33c0eccbb1af76cf0485a052659', @@ -66,6 +71,16 @@ class FacebookIE(InfoExtractor):              'title'          ]      }, { +        'note': 'Video with DASH manifest', +        'url': 'https://www.facebook.com/video.php?v=957955867617029', +        'md5': '54706e4db4f5ad58fbad82dde1f1213f', +        'info_dict': { +            'id': '957955867617029', +            'ext': 'mp4', +            'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', +            'uploader': 'Demy de Zeeuw', +        }, +    }, {          'url': 'https://www.facebook.com/video.php?v=10204634152394104',          'only_matching': True,      }, { @@ -147,13 +162,36 @@ class FacebookIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id -        webpage = self._download_webpage(url, video_id) +        req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id) +        req.add_header('User-Agent', self._CHROME_USER_AGENT) +        webpage = self._download_webpage(req, video_id) + +        video_data = None          BEFORE = '{swf.addParam(param[0], param[1]);});\n'          AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'          m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) -        if not m: +        if m: +            data = dict(json.loads(m.group(1))) +            params_raw = compat_urllib_parse_unquote(data['params']) +            video_data = json.loads(params_raw)['video_data'] + +        def video_data_list2dict(video_data): +            ret = {} +            for item in video_data: +                format_id = item['stream_type'] +                ret.setdefault(format_id, []).append(item) +            return ret + +        if not video_data: +            server_js_data = self._parse_json(self._search_regex( +                r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) +            for item in server_js_data['instances']: +                if item[1][0] == 'VideoConfig': +                    video_data = video_data_list2dict(item[2][0]['videoData']) +                    break + +        if not video_data:              m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)              if m_msg is not None:                  raise ExtractorError( @@ -161,12 +199,9 @@ class FacebookIE(InfoExtractor):                      expected=True)              else:                  raise ExtractorError('Cannot parse data') -        data = dict(json.loads(m.group(1))) -        params_raw = compat_urllib_parse_unquote(data['params']) -        params = json.loads(params_raw)          formats = [] -        for format_id, f in params['video_data'].items(): +        for format_id, f in video_data.items():              if not f or not isinstance(f, list):                  continue              for quality in ('sd', 'hd'): @@ -178,9 +213,16 @@ class FacebookIE(InfoExtractor):                              'url': src,                              'preference': -10 if format_id == 'progressive' else 0,                          }) +            dash_manifest = f[0].get('dash_manifest') +            if dash_manifest: +                formats.extend(self._parse_dash_manifest( +                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)), +                    namespace='urn:mpeg:dash:schema:mpd:2011'))          if not formats:              raise ExtractorError('Cannot find video formats') +        self._sort_formats(formats) +          video_title = self._html_search_regex(              r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',              default=None) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 027f55eb2..f6b9046f9 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -6,24 +6,29 @@ from ..utils import (      xpath_text,      xpath_with_ns,  ) +from .youtube import YoutubeIE  class GamekingsIE(InfoExtractor): -    _VALID_URL = r'http://www\.gamekings\.tv/(?:videos|nieuws)/(?P<id>[^/]+)' +    _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)'      _TESTS = [{ -        'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/', -        # MD5 is flaky, seems to change regularly -        # 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3', +        # YouTube embed video +        'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', +        'md5': '5208d3a17adeaef829a7861887cb9029',          'info_dict': { -            'id': 'phoenix-wright-ace-attorney-dual-destinies-review', +            'id': 'HkSQKetlGOU',              'ext': 'mp4', -            'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review', -            'description': 'md5:36fd701e57e8c15ac8682a2374c99731', +            'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', +            'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d',              'thumbnail': 're:^https?://.*\.jpg$', +            'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', +            'uploader': 'Gamekings Vault', +            'upload_date': '20151123',          }, +        'add_ie': ['Youtube'],      }, {          # vimeo video -        'url': 'http://www.gamekings.tv/videos/the-legend-of-zelda-majoras-mask/', +        'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/',          'md5': '12bf04dfd238e70058046937657ea68d',          'info_dict': {              'id': 'the-legend-of-zelda-majoras-mask', @@ -33,7 +38,7 @@ class GamekingsIE(InfoExtractor):              'thumbnail': 're:^https?://.*\.jpg$',          },      }, { -        'url': 'http://www.gamekings.tv/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', +        'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/',          'only_matching': True,      }] @@ -43,7 +48,11 @@ class GamekingsIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          playlist_id = self._search_regex( -            r'gogoVideo\(\s*\d+\s*,\s*"([^"]+)', webpage, 'playlist id') +            r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') + +        # Check if a YouTube embed is used +        if YoutubeIE.suitable(playlist_id): +            return self.url_result(playlist_id, ie='Youtube')          playlist = self._download_xml(              'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26d3698c8..b18e734c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1819,6 +1819,17 @@ class GenericIE(InfoExtractor):          if digiteka_url:              return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) +        # Look for Limelight embeds +        mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) +        if mobj: +            lm = { +                'Media': 'media', +                'Channel': 'channel', +                'ChannelList': 'channel_list', +            } +            return self.url_result('limelight:%s:%s' % ( +                lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) +          # Look for AdobeTVVideo embeds          mobj = re.search(              r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 1d391e69f..9f1ade2e4 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -2,12 +2,13 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import smuggle_url  class KickStarterIE(InfoExtractor):      _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*'      _TESTS = [{ -        'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location', +        'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant/description',          'md5': 'c81addca81327ffa66c642b5d8b08cab',          'info_dict': {              'id': '1404461844', @@ -27,7 +28,8 @@ class KickStarterIE(InfoExtractor):              'uploader_id': 'pebble',              'uploader': 'Pebble Technology',              'title': 'Pebble iOS Notifications', -        } +        }, +        'add_ie': ['Vimeo'],      }, {          'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',          'info_dict': { @@ -43,7 +45,7 @@ class KickStarterIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          title = self._html_search_regex( -            r'<title>\s*(.*?)(?:\s*— Kickstarter)?\s*</title>', +            r'<title>\s*(.*?)(?:\s*—\s*Kickstarter)?\s*</title>',              webpage, 'title')          video_url = self._search_regex(              r'data-video-url="(.*?)"', @@ -52,7 +54,7 @@ class KickStarterIE(InfoExtractor):              return {                  '_type': 'url_transparent',                  'ie_key': 'Generic', -                'url': url, +                'url': smuggle_url(url, {'to_generic': True}),                  'title': title,              } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 08bdae8a2..9665ece89 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -5,11 +5,13 @@ import datetime  import re  import time  import base64 +import hashlib  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,      compat_ord, +    compat_str,  )  from ..utils import (      determine_ext, @@ -258,6 +260,7 @@ class LetvCloudIE(InfoExtractor):          },      }, {          'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', +        'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',          'info_dict': {              'id': 'p7jnfw5hw9_ec93197892',              'ext': 'mp4', @@ -265,6 +268,7 @@ class LetvCloudIE(InfoExtractor):          },      }, {          'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', +        'md5': 'cb988699a776b22d4a41b9d43acfb3ac',          'info_dict': {              'id': 'p7jnfw5hw9_187060b6fd',              'ext': 'mp4', @@ -272,21 +276,37 @@ class LetvCloudIE(InfoExtractor):          },      }] -    def _real_extract(self, url): -        uu_mobj = re.search('uu=([\w]+)', url) -        vu_mobj = re.search('vu=([\w]+)', url) - -        if not uu_mobj or not vu_mobj: -            raise ExtractorError('Invalid URL: %s' % url, expected=True) - -        uu = uu_mobj.group(1) -        vu = vu_mobj.group(1) -        media_id = uu + '_' + vu - -        play_json_req = sanitized_Request( -            'http://api.letvcloud.com/gpc.php?cf=html5&sign=signxxxxx&ver=2.2&format=json&' + -            'uu=' + uu + '&vu=' + vu) -        play_json = self._download_json(play_json_req, media_id, 'Downloading playJson data') +    @staticmethod +    def sign_data(obj): +        if obj['cf'] == 'flash': +            salt = '2f9d6924b33a165a6d8b5d3d42f4f987' +            items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] +        elif obj['cf'] == 'html5': +            salt = 'fbeh5player12c43eccf2bec3300344' +            items = ['cf', 'ran', 'uu', 'bver', 'vu'] +        input_data = ''.join([item + obj[item] for item in items]) + salt +        obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() + +    def _get_formats(self, cf, uu, vu, media_id): +        def get_play_json(cf, timestamp): +            data = { +                'cf': cf, +                'ver': '2.2', +                'bver': 'firefox44.0', +                'format': 'json', +                'uu': uu, +                'vu': vu, +                'ran': compat_str(timestamp), +            } +            self.sign_data(data) +            return self._download_json( +                'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), +                media_id, 'Downloading playJson data for type %s' % cf) + +        play_json = get_play_json(cf, time.time()) +        # The server time may be different from local time +        if play_json.get('code') == 10071: +            play_json = get_play_json(cf, play_json['timestamp'])          if not play_json.get('data'):              if play_json.get('message'): @@ -312,6 +332,21 @@ class LetvCloudIE(InfoExtractor):                  'width': int_or_none(play_url.get('vwidth')),                  'height': int_or_none(play_url.get('vheight')),              }) + +        return formats + +    def _real_extract(self, url): +        uu_mobj = re.search('uu=([\w]+)', url) +        vu_mobj = re.search('vu=([\w]+)', url) + +        if not uu_mobj or not vu_mobj: +            raise ExtractorError('Invalid URL: %s' % url, expected=True) + +        uu = uu_mobj.group(1) +        vu = vu_mobj.group(1) +        media_id = uu + '_' + vu + +        formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index fb03dd527..1a0625ac3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -40,7 +40,8 @@ class LimelightBaseIE(InfoExtractor):              if not stream_url:                  continue              if '.f4m' in stream_url: -                formats.extend(self._extract_f4m_formats(stream_url, video_id)) +                formats.extend(self._extract_f4m_formats( +                    stream_url, video_id, fatal=False))              else:                  fmt = {                      'url': stream_url, @@ -72,8 +73,8 @@ class LimelightBaseIE(InfoExtractor):              format_id = mobile_url.get('targetMediaPlatform')              if determine_ext(media_url) == 'm3u8':                  formats.extend(self._extract_m3u8_formats( -                    media_url, video_id, 'mp4', entry_protocol='m3u8_native', -                    preference=-1, m3u8_id=format_id)) +                    media_url, video_id, 'mp4', 'm3u8_native', +                    m3u8_id=format_id, fatal=False))              else:                  formats.append({                      'url': media_url, diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py new file mode 100644 index 000000000..28e0dfe63 --- /dev/null +++ b/youtube_dl/extractor/matchtv.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( +    sanitized_Request, +    xpath_text, +) + + +class MatchTVIE(InfoExtractor): +    _VALID_URL = r'https?://matchtv\.ru/?#live-player' +    _TEST = { +        'url': 'http://matchtv.ru/#live-player', +        'info_dict': { +            'id': 'matchtv-live', +            'ext': 'flv', +            'title': 're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        video_id = 'matchtv-live' +        request = sanitized_Request( +            'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({ +                'ts': '', +                'quality': 'SD', +                'contentId': '561d2c0df7159b37178b4567', +                'sign': '', +                'includeHighlights': '0', +                'userId': '', +                'sessionId': random.randint(1, 1000000000), +                'contentType': 'channel', +                'timeShift': '0', +                'platform': 'portal', +            }), +            headers={ +                'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf', +            }) +        video_url = self._download_json(request, video_id)['data']['videoUrl'] +        f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') +        formats = self._extract_f4m_formats(f4m_url, video_id) +        return { +            'id': video_id, +            'title': self._live_title('Матч ТВ - Прямой эфир'), +            'is_live': True, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 9d26030d3..a071378b6 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -18,13 +18,17 @@ class NBAIE(InfoExtractor):          'md5': '9e7729d3010a9c71506fd1248f74e4f4',          'info_dict': {              'id': '0021200253-okc-bkn-recap', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Thunder vs. Nets',              'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',              'duration': 181,              'timestamp': 1354638466,              'upload_date': '20121204',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',          'only_matching': True, @@ -68,7 +72,7 @@ class NBAIE(InfoExtractor):              if video_url.startswith('/'):                  continue              if video_url.endswith('.m3u8'): -                formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False)) +                formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))              elif video_url.endswith('.f4m'):                  formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False))              else: diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1dd54c2f1..18d01f423 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -19,32 +19,39 @@ class NBCIE(InfoExtractor):      _TESTS = [          {              'url': 'http://www.nbc.com/the-tonight-show/segments/112966', -            # md5 checksum is not stable              'info_dict': { -                'id': 'c9xnCo0YPOPH', -                'ext': 'flv', +                'id': '112966', +                'ext': 'mp4',                  'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',                  'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://www.nbc.com/the-tonight-show/episodes/176',              'info_dict': { -                'id': 'XwU9KZkp98TH', +                'id': '176',                  'ext': 'flv',                  'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',                  'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',              }, -            'skip': 'Only works from US', +            'skip': '404 Not Found',          },          {              'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',              'info_dict': { -                'id': '8iUuyzWDdYUZ', -                'ext': 'flv', +                'id': '2832821', +                'ext': 'mp4',                  'title': 'Star Wars Teaser',                  'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },              'skip': 'Only works from US',          },          { @@ -66,7 +73,11 @@ class NBCIE(InfoExtractor):              webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))          if theplatform_url.startswith('//'):              theplatform_url = 'http:' + theplatform_url -        return self.url_result(smuggle_url(theplatform_url, {'source_url': url})) +        return { +            '_type': 'url_transparent', +            'url': smuggle_url(theplatform_url, {'source_url': url}), +            'id': video_id, +        }  class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eb12fb810..87f5675c7 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -189,7 +189,7 @@ class NPOIE(NPOBaseIE):                  if not video_url:                      continue                  if format_id == 'adaptive': -                    formats.extend(self._extract_m3u8_formats(video_url, video_id)) +                    formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))                  else:                      formats.append({                          'url': video_url, @@ -406,6 +406,38 @@ class NPORadioFragmentIE(InfoExtractor):          } +class SchoolTVIE(InfoExtractor): +    IE_NAME = 'schooltv' +    _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)' + +    _TEST = { +        'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', +        'info_dict': { +            'id': 'WO_NTR_429477', +            'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', +            'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', +            'ext': 'mp4', +            'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' +        }, +        'params': { +            # Skip because of m3u8 download +            'skip_download': True +        } +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        video_id = self._search_regex( +            r'data-mid=(["\'])(?P<id>.+?)\1', webpage, 'video_id', group='id') +        return { +            '_type': 'url_transparent', +            'ie_key': 'NPO', +            'url': 'npo:%s' % video_id, +            'display_id': display_id +        } + +  class VPROIE(NPOIE):      IE_NAME = 'vpro'      _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ff13050d..a126f5054 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -133,26 +133,32 @@ class NRKTVIE(InfoExtractor):      _TESTS = [          {              'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', -            'md5': 'adf2c5454fa2bf032f47a9f8fb351342',              'info_dict': {                  'id': 'MUHH48000314', -                'ext': 'flv', +                'ext': 'mp4',                  'title': '20 spørsmål',                  'description': 'md5:bdea103bc35494c143c6a9acdd84887a',                  'upload_date': '20140523',                  'duration': 1741.52,              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'https://tv.nrk.no/program/mdfp15000514', -            'md5': '383650ece2b25ecec996ad7b5bb2a384',              'info_dict': {                  'id': 'mdfp15000514', -                'ext': 'flv', -                'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting', +                'ext': 'mp4', +                'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',                  'description': 'md5:654c12511f035aed1e42bdf5db3b206a',                  'upload_date': '20140524', -                'duration': 4605.0, +                'duration': 4605.08, +            }, +            'params': { +                # m3u8 download +                'skip_download': True,              },          },          { diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 05f93904c..e5d62a139 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -71,7 +71,7 @@ class ScreenwaveMediaIE(InfoExtractor):          formats = []          for source in sources:              if source['type'] == 'hls': -                formats.extend(self._extract_m3u8_formats(source['file'], video_id)) +                formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4'))              else:                  file_ = source.get('file')                  if not file_: @@ -107,7 +107,11 @@ class TeamFourIE(InfoExtractor):              'upload_date': '20130401',              'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar',              'title': 'A Moment With TFS Episode 4', -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 474ebb49b..990ea0fa8 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -53,17 +53,25 @@ class SenateISVPIE(InfoExtractor):          'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',          'info_dict': {              'id': 'judiciary031715', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Integrated Senate Video Player',              'thumbnail': 're:^https?://.*\.(?:jpg|png)$', -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',          'info_dict': {              'id': 'commerce011514', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Integrated Senate Video Player' -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',          # checksum differs each time diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index fa338b936..1457e524e 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -17,18 +17,21 @@ class TV2IE(InfoExtractor):      _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'      _TEST = {          'url': 'http://www.tv2.no/v/916509/', -        'md5': '9cb9e3410b18b515d71892f27856e9b1',          'info_dict': {              'id': '916509', -            'ext': 'flv', -            'title': 'Se Gryttens hyllest av Steven Gerrard', +            'ext': 'mp4', +            'title': 'Se Frode Gryttens hyllest av Steven Gerrard',              'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',              'timestamp': 1431715610,              'upload_date': '20150515',              'duration': 156.967,              'view_count': int,              'categories': list, -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 86ba70ed9..14e945d49 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -86,10 +86,9 @@ class VGTVIE(XstreamIE):          {              # streamType: wasLive              'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', -            'md5': '458f4841239dab414343b50e5af8869c',              'info_dict': {                  'id': '113063', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'V75 fra Solvalla 30.05.15',                  'description': 'md5:b3743425765355855f88e096acc93231',                  'thumbnail': 're:^https?://.*\.jpg', @@ -98,6 +97,10 @@ class VGTVIE(XstreamIE):                  'upload_date': '20150530',                  'view_count': int,              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 2ba9f31df..7c6e98026 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import smuggle_url  class VidziIE(InfoExtractor): @@ -13,6 +14,11 @@ class VidziIE(InfoExtractor):              'id': 'cghql9yq6emu',              'ext': 'mp4',              'title': 'youtube-dl test video  1\\\\2\'3/4<5\\\\6ä7↭', +            'uploader': 'vidzi.tv', +        }, +        'params': { +            # m3u8 download +            'skip_download': True,          },      } @@ -20,19 +26,14 @@ class VidziIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        video_host = self._html_search_regex( -            r'id=\'vplayer\'><img src="http://(.*?)/i', webpage, -            'video host') -        video_hash = self._html_search_regex( -            r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash') -        ext = self._html_search_regex( -            r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext') -        video_url = 'http://' + video_host + '/' + video_hash + '/v.' + ext          title = self._html_search_regex(              r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') +        # Vidzi now uses jwplayer, which can be handled by GenericIE          return { +            '_type': 'url_transparent',              'id': video_id,              'title': title, -            'url': video_url, +            'url': smuggle_url(url, {'to_generic': True}), +            'ie_key': 'Generic',          } diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 525e303d4..315984bf9 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -45,6 +45,10 @@ class ViideaIE(InfoExtractor):              'upload_date': '20130627',              'duration': 565,          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          # video with invalid direct format links (HTTP 403)          'url': 'http://videolectures.net/russir2010_filippova_nlp/', diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 90557fa61..11014865a 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -321,7 +321,7 @@ class VKIE(InfoExtractor):  class VKUserVideosIE(InfoExtractor):      IE_NAME = 'vk:uservideos'      IE_DESC = "VK - User's Videos" -    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$' +    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'      _TEMPLATE_URL = 'https://vk.com/videos'      _TESTS = [{          'url': 'http://vk.com/videos205387401', @@ -333,6 +333,9 @@ class VKUserVideosIE(InfoExtractor):      }, {          'url': 'http://vk.com/videos-77521',          'only_matching': True, +    }, { +        'url': 'http://vk.com/videos-97664626?section=all', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 8bbac54e2..2466410fa 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -34,19 +34,20 @@ class XuiteIE(InfoExtractor):          },      }, {          # Video with only one format -        'url': 'http://vlog.xuite.net/play/TkRZNjhULTM0NDE2MjkuZmx2', -        'md5': 'c45737fc8ac5dc8ac2f92ecbcecf505e', +        'url': 'http://vlog.xuite.net/play/WUxxR2xCLTI1OTI1MDk5LmZsdg==', +        'md5': '21f7b39c009b5a4615b4463df6eb7a46',          'info_dict': { -            'id': '3441629', +            'id': '25925099',              'ext': 'mp4', -            'title': '孫燕姿 - 眼淚成詩', +            'title': 'BigBuckBunny_320x180',              'thumbnail': 're:^https?://.*\.jpg$', -            'duration': 217.399, -            'timestamp': 1299383640, -            'upload_date': '20110306', -            'uploader': 'Valen', -            'uploader_id': '10400126', -            'categories': ['影視娛樂'], +            'duration': 596.458, +            'timestamp': 1454242500, +            'upload_date': '20160131', +            'uploader': 'yan12125', +            'uploader_id': '12158353', +            'categories': ['個人短片'], +            'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',          },      }, {          # Video with two formats diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index dd724085a..b29baafc4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -114,15 +114,13 @@ class YouPornIE(InfoExtractor):              formats.append(f)          self._sort_formats(formats) -        description = self._html_search_regex( -            r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', -            webpage, 'description', default=None) +        description = self._og_search_description(webpage, default=None)          thumbnail = self._search_regex(              r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',              webpage, 'thumbnail', fatal=False, group='thumbnail')          uploader = self._html_search_regex( -            r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', +            r'(?s)<div[^>]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>',              webpage, 'uploader', fatal=False)          upload_date = unified_strdate(self._html_search_regex(              r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 92b9f3ae4..a7f8c968e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -181,7 +181,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return -class YoutubeEntryListBaseInfoExtractor(InfoExtractor): +class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):      # Extract entries from page with "Load more" button      def _entries(self, page, playlist_id):          more_widget_html = content_html = page @@ -233,7 +233,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):  class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):      def _process_page(self, content): -        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content): +        for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):              yield self.url_result(                  'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -316,55 +316,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},          # DASH mp4 video -        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) -        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, -        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, -        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, -        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, +        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) +        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, +        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, +        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, +        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},          # Dash mp4 audio -        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, -        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, -        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, +        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, +        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, +        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},          # Dash webm -        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, -        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, -        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, +        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, +        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},          # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) -        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, -        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, -        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, -        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, -        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, +        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, +        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, +        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, +        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, +        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},          # Dash webm audio -        '171': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, -        '172': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, +        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, +        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},          # Dash webm audio with opus inside -        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, -        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, -        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, +        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, +        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, +        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},          # RTMP (unnamed)          '_rtmp': {'protocol': 'rtmp'}, @@ -1035,73 +1035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id          return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') -    def _parse_dash_manifest( -            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True): -        def decrypt_sig(mobj): -            s = mobj.group(1) -            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) -            return '/signature/%s' % dec_s -        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url) -        dash_doc = self._download_xml( -            dash_manifest_url, video_id, -            note='Downloading DASH manifest', -            errnote='Could not download DASH manifest', -            fatal=fatal) - -        if dash_doc is False: -            return [] - -        formats = [] -        for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'): -            mime_type = a.attrib.get('mimeType') -            for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'): -                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') -                if url_el is None: -                    continue -                if mime_type == 'text/vtt': -                    # TODO implement WebVTT downloading -                    pass -                elif mime_type.startswith('audio/') or mime_type.startswith('video/'): -                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList') -                    format_id = r.attrib['id'] -                    video_url = url_el.text -                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) -                    f = { -                        'format_id': format_id, -                        'url': video_url, -                        'width': int_or_none(r.attrib.get('width')), -                        'height': int_or_none(r.attrib.get('height')), -                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), -                        'asr': int_or_none(r.attrib.get('audioSamplingRate')), -                        'filesize': filesize, -                        'fps': int_or_none(r.attrib.get('frameRate')), -                    } -                    if segment_list is not None: -                        f.update({ -                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], -                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], -                            'protocol': 'http_dash_segments', -                        }) -                    try: -                        existing_format = next( -                            fo for fo in formats -                            if fo['format_id'] == format_id) -                    except StopIteration: -                        full_info = self._formats.get(format_id, {}).copy() -                        full_info.update(f) -                        codecs = r.attrib.get('codecs') -                        if codecs: -                            if full_info.get('acodec') == 'none': -                                full_info['vcodec'] = codecs -                            elif full_info.get('vcodec') == 'none': -                                full_info['acodec'] = codecs -                        formats.append(full_info) -                    else: -                        existing_format.update(f) -                else: -                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) -        return formats -      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -1533,8 +1466,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              for dash_manifest_url in dash_mpds:                  dash_formats = {}                  try: -                    for df in self._parse_dash_manifest( -                            video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal): +                    def decrypt_sig(mobj): +                        s = mobj.group(1) +                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) +                        return '/signature/%s' % dec_s + +                    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url) + +                    for df in self._extract_dash_manifest_formats( +                            dash_manifest_url, video_id, fatal=dash_mpd_fatal, +                            namespace='urn:mpeg:DASH:schema:MPD:2011', formats_dict=self._formats):                          # Do not overwrite DASH format found in some previous DASH manifest                          if df['format_id'] not in dash_formats:                              dash_formats[df['format_id']] = df @@ -1602,7 +1543,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): +class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com playlists'      _VALID_URL = r"""(?x)(?:                          (?:https?://)? @@ -1846,7 +1787,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):  class YoutubeUserIE(YoutubeChannelIE):      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' -    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' +    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'      _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'      IE_NAME = 'youtube:user' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 433245f00..2137dfb3f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -416,6 +416,11 @@ def parseOpts(overrideArguments=None):          dest='hls_prefer_native', action='store_true',          help='Use the native HLS downloader instead of ffmpeg (experimental)')      downloader.add_option( +        '--hls-use-mpegts', +        dest='hls_use_mpegts', action='store_true', +        help='Use the mpegts container for HLS videos, allowing to play the ' +             'video while downloading (some players may not be able to play it)') +    downloader.add_option(          '--external-downloader',          dest='external_downloader', metavar='COMMAND',          help='Use the specified external downloader. ' diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 48f86e21c..d3d9d4f1d 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -483,6 +483,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):              self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert')              return [], info          self._downloader.to_screen('[ffmpeg] Converting subtitles') +        sub_filenames = []          for lang, sub in subs.items():              ext = sub['ext']              if ext == new_ext: @@ -490,6 +491,8 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):                      '[ffmpeg] Subtitle file for %s is already in the requested'                      'format' % new_ext)                  continue +            old_file = subtitles_filename(filename, lang, ext) +            sub_filenames.append(old_file)              new_file = subtitles_filename(filename, lang, new_ext)              if ext == 'dfxp' or ext == 'ttml': @@ -497,7 +500,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):                      'You have requested to convert dfxp (TTML) subtitles into another format, '                      'which results in style information loss') -                dfxp_file = subtitles_filename(filename, lang, ext) +                dfxp_file = old_file                  srt_file = subtitles_filename(filename, lang, 'srt')                  with io.open(dfxp_file, 'rt', encoding='utf-8') as f: @@ -515,9 +518,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):                  if new_ext == 'srt':                      continue -            self.run_ffmpeg( -                subtitles_filename(filename, lang, ext), -                new_file, ['-f', new_format]) +            self.run_ffmpeg(old_file, new_file, ['-f', new_format])              with io.open(new_file, 'rt', encoding='utf-8') as f:                  subs[lang] = { @@ -525,4 +526,4 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):                      'data': f.read(),                  } -        return [], info +        return sub_filenames, info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c63b61598..18dbe28bb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):          'ttaf1': 'http://www.w3.org/2006/10/ttaf1',      }) -    def parse_node(node): -        str_or_empty = functools.partial(str_or_none, default='') +    class TTMLPElementParser: +        out = '' -        out = str_or_empty(node.text) +        def start(self, tag, attrib): +            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): +                self.out += '\n' -        for child in node: -            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): -                out += '\n' + str_or_empty(child.tail) -            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): -                out += str_or_empty(parse_node(child)) -            else: -                out += str_or_empty(xml.etree.ElementTree.tostring(child)) +        def end(self, tag): +            pass -        return out +        def data(self, data): +            self.out += data + +        def close(self): +            return self.out.strip() + +    def parse_node(node): +        target = TTMLPElementParser() +        parser = xml.etree.ElementTree.XMLParser(target=target) +        parser.feed(xml.etree.ElementTree.tostring(node)) +        return parser.close()      dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))      out = [] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4ac7f9e93..6da42c5a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2016.01.27' +__version__ = '2016.02.01' | 
