diff options
40 files changed, 494 insertions(+), 223 deletions(-)
| @@ -124,3 +124,4 @@ Mohammad Teimori Pabandi  Roman Le Négrate  Matthias Küch  Julian Richen +Ping O. @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms  To install it right away for all UNIX users (Linux, OS X, etc.), type:      sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl -    sudo chmod a+x /usr/local/bin/youtube-dl +    sudo chmod a+rx /usr/local/bin/youtube-dl  If you do not have curl, you can alternatively use a recent wget:      sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl -    sudo chmod a+x /usr/local/bin/youtube-dl +    sudo chmod a+rx /usr/local/bin/youtube-dl  Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43fbe8b1d..a4879bd9a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,6 +142,7 @@   - **Eporner**   - **EroProfile**   - **Escapist** + - **ESPN** (Currently broken)   - **EveryonesMixtape**   - **exfm**: ex.fm   - **ExpoTV** @@ -338,6 +339,7 @@   - **OktoberfestTV**   - **on.aol.com**   - **Ooyala** + - **OoyalaExternal**   - **OpenFilm**   - **orf:fm4**: radio FM4   - **orf:iptv**: iptv.ORF.at @@ -451,6 +453,7 @@   - **Spike**   - **Sport5**   - **SportBox** + - **SportBoxEmbed**   - **SportDeutschland**   - **Srf**   - **SRMediathek**: Saarländischer Rundfunk @@ -510,6 +513,8 @@   - **Turbo**   - **Tutv**   - **tv.dfb.de** + - **TV2** + - **TV2Article**   - **TV4**: tv4.se and tv4play.se   - **tvigle**: Интернет-телевидение Tvigle.ru   - **tvp.pl** diff --git a/test/test_utils.py b/test/test_utils.py index b40107037..e13e11b59 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -621,6 +621,21 @@ Line  '''          self.assertEqual(dfxp2srt(dfxp_data), srt_data) +        
dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?> +            <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> +            <body> +                <div xml:lang="en"> +                    <p begin="0" end="1">The first line</p> +                </div> +            </body> +            </tt>''' +        srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The first line + +''' +        self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) +  if __name__ == '__main__':      unittest.main() @@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34  deps =     nose     coverage +# We need a valid $HOME for test_compat_expanduser +passenv = HOME  defaultargs = test --exclude test_download.py --exclude test_age_restriction.py      --exclude test_subtitles.py --exclude test_write_annotations.py      --exclude test_youtube_lists.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5df889945..d1953c18f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1368,7 +1368,7 @@ class YoutubeDL(object):                          postprocessors = []                          self.report_warning('You have requested multiple '                                              'formats but ffmpeg or avconv are not installed.' -                                            ' The formats won\'t be merged') +                                            ' The formats won\'t be merged.')                      else:                          postprocessors = [merger] @@ -1395,8 +1395,8 @@ class YoutubeDL(object):                      requested_formats = info_dict['requested_formats']                      if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):                          info_dict['ext'] = 'mkv' -                        self.report_warning('You have requested formats incompatible for merge. 
' -                                            'The formats will be merged into mkv') +                        self.report_warning( +                            'Requested formats are incompatible for merge and will be merged into mkv.')                      # Ensure filename always has a correct extension for successful merge                      filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])                      if os.path.exists(encodeFilename(filename)): @@ -1527,6 +1527,7 @@ class YoutubeDL(object):              pps_chain.extend(ie_info['__postprocessors'])          pps_chain.extend(self._pps)          for pp in pps_chain: +            files_to_delete = []              try:                  files_to_delete, info = pp.run(info)              except PostProcessingError as e: diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):      """      NONCE_LENGTH_BYTES = 8 -    data = bytes_to_intlist(base64.b64decode(data)) +    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))      password = bytes_to_intlist(password.encode('utf-8'))      key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d131d3ec3..79bcd9106 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -339,8 +339,7 @@ from .newstube import NewstubeIE  from .nextmedia import (      NextMediaIE,      NextMediaActionNewsIE, -    AppleDailyRealtimeNewsIE, -    AppleDailyAnimationNewsIE +    AppleDailyIE,  )  from .nfb import NFBIE  from .nfl import NFLIE @@ -573,7 +572,10 @@ from .tumblr import TumblrIE  from .tunein import TuneInIE  from .turbo import TurboIE  from .tutv import TutvIE -from .tv2 import TV2IE +from .tv2 import ( +    TV2IE, +    TV2ArticleIE, +)  from .tv4 import TV4IE  from 
.tvigle import TvigleIE  from .tvp import TvpIE, TvpSeriesIE @@ -645,7 +647,10 @@ from .vine import (      VineIE,      VineUserIE,  ) -from .viki import VikiIE +from .viki import ( +    VikiIE, +    VikiChannelIE, +)  from .vk import (      VKIE,      VKUserVideosIE, diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):          base64_video_info = self._html_search_regex(              r'var cozVidData = "(.+?)";', webpage, 'video data') -        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") +        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')          video_info_dict = json.loads(decoded_video_info)          # get video information from dict diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import (  class CNNIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ -        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' +        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''      _TESTS = [{          'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    parse_iso8601, +)  class DRTVIE(InfoExtractor): @@ -60,19 
+63,31 @@ class DRTVIE(InfoExtractor):                  restricted_to_denmark = asset['RestrictedToDenmark']                  spoken_subtitles = asset['Target'] == 'SpokenSubtitles'                  for link in asset['Links']: -                    target = link['Target']                      uri = link['Uri'] +                    target = link['Target']                      format_id = target -                    preference = -1 if target == 'HDS' else -2 +                    preference = None                      if spoken_subtitles: -                        preference -= 2 +                        preference = -1                          format_id += '-spoken-subtitles' -                    formats.append({ -                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, -                        'format_id': format_id, -                        'ext': link['FileFormat'], -                        'preference': preference, -                    }) +                    if target == 'HDS': +                        formats.extend(self._extract_f4m_formats( +                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', +                            video_id, preference, f4m_id=format_id)) +                    elif target == 'HLS': +                        formats.extend(self._extract_m3u8_formats( +                            uri, video_id, 'mp4', preference=preference, +                            m3u8_id=format_id)) +                    else: +                        bitrate = link.get('Bitrate') +                        if bitrate: +                            format_id += '-%s' % bitrate +                        formats.append({ +                            'url': uri, +                            'format_id': format_id, +                            'tbr': bitrate, +                            'ext': link.get('FileFormat'), +                        })                  subtitles_list = asset.get('SubtitlesList')    
              if isinstance(subtitles_list, list):                      LANGS = { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..9a5a8f4bb 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE  class EMPFlixIE(TNAFlixIE): -    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' +    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'      _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'      _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'      _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' -    _TEST = { -        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', -        'md5': 'b1bc15b6412d33902d6e5952035fcabc', -        'info_dict': { -            'id': '33051', -            'display_id': 'Amateur-Finger-Fuck', -            'ext': 'mp4', -            'title': 'Amateur Finger Fuck', -            'description': 'Amateur solo finger fucking.', -            'thumbnail': 're:https?://.*\.jpg$', -            'age_limit': 18, +    _TESTS = [ +        { +            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', +            'md5': 'b1bc15b6412d33902d6e5952035fcabc', +            'info_dict': { +                'id': '33051', +                'display_id': 'Amateur-Finger-Fuck', +                'ext': 'mp4', +                'title': 'Amateur Finger Fuck', +                'description': 'Amateur solo finger fucking.', +                'thumbnail': 're:https?://.*\.jpg$', +                'age_limit': 18, +            } +        }, +        { +            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', +            'matching_only': True,          } -    } +    ] diff --git a/youtube_dl/extractor/instagram.py 
b/youtube_dl/extractor/instagram.py index 65f6ca103..b10755788 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -7,9 +7,9 @@ from ..utils import int_or_none  class InstagramIE(InfoExtractor): -    _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)' +    _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'      _TEST = { -        'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc', +        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',          'md5': '0d2da106a9d2631273e192b372806516',          'info_dict': {              'id': 'aye83DjauH', @@ -41,11 +41,11 @@ class InstagramIE(InfoExtractor):  class InstagramUserIE(InfoExtractor): -    _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' +    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'      IE_DESC = 'Instagram user profile'      IE_NAME = 'instagram:user'      _TEST = { -        'url': 'http://instagram.com/porsche', +        'url': 'https://instagram.com/porsche',          'info_dict': {              'id': 'porsche',              'title': 'porsche', diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 1484ac0d2..da896caf1 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -50,9 +50,7 @@ class LetvIE(InfoExtractor):              'title': '与龙共舞 完整版',              'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',          }, -        'params': { -            'cn_verification_proxy': 'http://proxy.uku.im:8888' -        }, +        'skip': 'Only available in China',      }]      @staticmethod diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', -       
 'md5': '6a75fe9d0d3275bead0cb683c616fddb',          'info_dict': {              'id': '0fce117d',              'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor):              'display_id': 'programa-144',              'duration': 2913,          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }]      def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor):              episode,              transform_source=strip_jsonp          ) +        formats = self._extract_m3u8_formats( +            token_info['tokenizedUrl'], episode, ext='mp4')          return {              'id': embed_data['videoId'],              'display_id': episode,              'title': info_el.find('title').text, -            'url': token_info['tokenizedUrl'], +            'formats': formats,              'description': get_element_by_attribute('class', 'text', webpage),              'thumbnail': info_el.find('thumb').text,              'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):      }, {          'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',          'only_matching': True, +    }, { +        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', +        'info_dict': { +            'id': '0041400301-cle-atl-recap.nba', +            'ext': 'mp4', +            'title': 'NBA GAME TIME | Video: Hawks vs. 
Cavaliers Game 1', +            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', +            'duration': 228, +        }, +        'params': { +            'skip_download': True, +        }      }]      def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):              self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')          description = self._og_search_description(webpage) -        duration = parse_duration( -            self._html_search_meta('duration', webpage, 'duration')) +        duration_str = self._html_search_meta( +            'duration', webpage, 'duration', default=None) +        if not duration_str: +            duration_str = self._html_search_regex( +                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) +        duration = parse_duration(duration_str)          return {              'id': shortened_video_id, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):          return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): -    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): +    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'      
_TESTS = [{          'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',          'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):              'ext': 'mp4',              'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',              'thumbnail': 're:^https?://.*\.jpg$', -            'description': 'md5:b23787119933404ce515c6356a8c355c', +            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',              'upload_date': '20150128',          }      }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):              'ext': 'mp4',              'title': '不滿被踩腳 山東兩大媽一路打下車',              'thumbnail': 're:^https?://.*\.jpg$', -            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', +            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',              'upload_date': '20150128',          } -    }] - -    _URL_PATTERN = r'\{url: \'(.+)\'\}' - -    def _fetch_title(self, page): -        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - -    def _fetch_thumbnail(self, page): -        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - -    def _fetch_timestamp(self, page): -        return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): -    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' 
-    _TESTS = [{ +    }, {          'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',          'md5': '03df296d95dedc2d5886debbb80cb43f',          'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):          'expected_warnings': [              'video thumbnail',          ] +    }, { +        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', +        'only_matching': True,      }] +    _URL_PATTERN = r'\{url: \'(.+)\'\}' +      def _fetch_title(self, page): -        return self._html_search_meta('description', page, 'news title') +        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or +                self._html_search_meta('description', page, 'news title')) + +    def _fetch_thumbnail(self, page): +        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + +    def _fetch_timestamp(self, page): +        return None      def _fetch_description(self, page):          return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import (  class ProSiebenSat1IE(InfoExtractor):      IE_NAME = 'prosiebensat1'      IE_DESC = 'ProSiebenSat.1 Digital' -    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' +    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'      _TESTS = [          { diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 13113820b..b540033e2 100644 --- 
a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -26,7 +26,7 @@ class QQMusicIE(InfoExtractor):              'title': '可惜没如果',              'upload_date': '20141227',              'creator': '林俊杰', -            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30', +            'description': 'md5:d327722d0361576fde558f1ac68a7065',          }      }] @@ -60,6 +60,8 @@ class QQMusicIE(InfoExtractor):          lrc_content = self._html_search_regex(              r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',              detail_info_page, 'LRC lyrics', default=None) +        if lrc_content: +            lrc_content = lrc_content.replace('\\n', '\n')          guid = self.m_r_get_ruin() diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 849300140..82cd98ac7 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ from ..utils import (  def _decrypt_url(png): -    encrypted_data = base64.b64decode(png) +    encrypted_data = base64.b64decode(png.encode('utf-8'))      text_index = encrypted_data.find(b'tEXt')      text_chunk = encrypted_data[text_index - 4:]      length = struct_unpack('!I', text_chunk[:4])[0] diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):      @classmethod      def _extract_url(cls, webpage):          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 
26ced716e..9f3e944e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -47,7 +47,7 @@ class SharedIE(InfoExtractor):          video_url = self._html_search_regex(              r'data-url="([^"]+)"', video_page, 'video URL')          title = base64.b64decode(self._html_search_meta( -            'full:title', webpage, 'title')).decode('utf-8') +            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')          filesize = int_or_none(self._html_search_meta(              'full:size', webpage, 'file size', fatal=False))          thumbnail = self._html_search_regex( diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index eab4adfca..29bd9ce6f 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -23,9 +23,7 @@ class SohuIE(InfoExtractor):              'ext': 'mp4',              'title': 'MV:Far East Movement《The Illest》',          }, -        'params': { -            'cn_verification_proxy': 'proxy.uku.im:8888' -        } +        'skip': 'On available in China',      }, {          'url': 'http://tv.sohu.com/20150305/n409385080.shtml',          'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 8686f9d11..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,8 +6,7 @@ import re  from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import ( -    parse_duration, -    parse_iso8601, +    unified_strdate,  ) @@ -20,11 +19,9 @@ class SportBoxIE(InfoExtractor):              'id': '80822',              'ext': 'mp4',              'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', -            'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', +            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',              'thumbnail': 're:^https?://.*\.jpg$', -            'timestamp': 
1411896237,              'upload_date': '20140928', -            'duration': 4846,          },          'params': {              # m3u8 download @@ -48,17 +45,13 @@ class SportBoxIE(InfoExtractor):              r'src="/?(vdl/player/[^"]+)"', webpage, 'player')          title = self._html_search_regex( -            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') -        description = self._html_search_regex( -            r'(?s)<div itemprop="description">(.+?)</div>', -            webpage, 'description', fatal=False) +            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], +            webpage, 'title') +        description = self._og_search_description(webpage) or self._html_search_meta( +            'description', webpage, 'description')          thumbnail = self._og_search_thumbnail(webpage) -        timestamp = parse_iso8601(self._search_regex( -            r'<span itemprop="uploadDate">([^<]+)</span>', -            webpage, 'timestamp', fatal=False)) -        duration = parse_duration(self._html_search_regex( -            r'<meta itemprop="duration" content="PT([^"]+)">', -            webpage, 'duration', fatal=False)) +        upload_date = unified_strdate(self._html_search_meta( +            'dateCreated', webpage, 'upload date'))          return {              '_type': 'url_transparent', @@ -67,8 +60,7 @@ class SportBoxIE(InfoExtractor):              'title': title,              'description': description,              'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, +            'upload_date': upload_date,          } diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 251a68680..a0c744fd1 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):              'title': 'Con Martín Berasategui, hacer un bacalao al ...',              'duration': 662,          }, 
+        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',          'only_matching': True, diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index d48cbbf14..59af9aba0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,26 +10,32 @@ from ..utils import (  class TNAFlixIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'      _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'      _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'      _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' -    _TEST = { -        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', -        'md5': 'ecf3498417d09216374fc5907f9c6ec0', -        'info_dict': { -            'id': '553878', -            'display_id': 'Carmella-Decesare-striptease', -            'ext': 'mp4', -            'title': 'Carmella Decesare - striptease', -            'description': '', -            'thumbnail': 're:https?://.*\.jpg$', -            'duration': 91, -            'age_limit': 18, +    _TESTS = [ +        { +            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', +            'md5': 'ecf3498417d09216374fc5907f9c6ec0', +            'info_dict': { +                'id': '553878', +                'display_id': 'Carmella-Decesare-striptease', +                'ext': 'mp4', +                'title': 'Carmella Decesare - striptease', +                'description': '', +                'thumbnail': 're:https?://.*\.jpg$', +                'duration': 91, +                
'age_limit': 18, +            } +        }, +        { +            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', +            'matching_only': True,          } -    } +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4de0aac52..fad720b68 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):          data_content = self._download_webpage(              'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') -        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') +        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')          return {              'id': internal_id, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 2dcc0e971..fa338b936 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -1,12 +1,15 @@  # encoding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      determine_ext,      int_or_none,      float_or_none,      parse_iso8601, +    remove_end,  ) @@ -91,3 +94,33 @@ class TV2IE(InfoExtractor):              'categories': categories,              'formats': formats,          } + + +class TV2ArticleIE(InfoExtractor): +    _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', +        'info_dict': { +            'id': '6930542', +            'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', +            'description': 'md5:339573779d3eea3542ffe12006190954', +        }, +        'playlist_count': 2, +    }, { +        'url': 
'http://www.tv2.no/a/6930542', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        entries = [ +            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') +            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + +        title = remove_end(self._og_search_title(webpage), ' - TV2.no') +        description = remove_end(self._og_search_description(webpage), ' - TV2.no') + +        return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index 96c809eaf..c4751050e 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse  from ..utils import (      ExtractorError,      qualities, @@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        deliver_url = self._search_regex( -            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', -            webpage, 'deliver URL') +        deliver_url = self._proto_relative_url(self._search_regex( +            r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', +            webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')          deliver_page = self._download_webpage(              deliver_url, video_id, 'Downloading iframe page') @@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):          player = self._parse_json(              self._search_regex( -                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), +                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", + 
               deliver_page, 'player'),              video_id)          quality = qualities(['flash', 'html5']) diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index ececc7ee0..591024ead 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):          formats = [              { -                'url': base64.b64decode(res['u']).decode('utf-8'), +                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),                  'ext': 'flv',                  'format_id': res['l'],              } for res in settings['res'] if res['u'] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 619039e51..15377097e 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -38,11 +38,14 @@ class VierIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          video_id = self._search_regex( -            r'"nid"\s*:\s*"(\d+)"', webpage, 'video id') +            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], +            webpage, 'video id')          application = self._search_regex( -            r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod') +            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], +            webpage, 'application', default='vier_vod')          filename = self._search_regex( -            r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename') +            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], +            webpage, 'filename')          playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)          formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cf6af1e5c..7f2fb1ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ 
-1,29 +1,65 @@  from __future__ import unicode_literals -import re +import time +import hmac +import hashlib +import itertools -from ..compat import ( -    compat_urlparse, -    compat_urllib_request, -)  from ..utils import (      ExtractorError, -    unescapeHTML, -    unified_strdate, -    US_RATINGS, -    determine_ext, -    mimetype2ext, +    int_or_none, +    parse_age_limit, +    parse_iso8601,  )  from .common import InfoExtractor -class VikiIE(InfoExtractor): -    IE_NAME = 'viki' +class VikiBaseIE(InfoExtractor): +    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' +    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' +    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + +    _APP = '65535a' +    _APP_VERSION = '2.2.5.1428709186' +    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + +    def _prepare_call(self, path, timestamp=None): +        path += '?' if '?' not in path else '&' +        if not timestamp: +            timestamp = int(time.time()) +        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) +        sig = hmac.new( +            self._APP_SECRET.encode('ascii'), +            query.encode('ascii'), +            hashlib.sha1 +        ).hexdigest() +        return self._API_URL_TEMPLATE % (query, sig) + +    def _call_api(self, path, video_id, note, timestamp=None): +        resp = self._download_json( +            self._prepare_call(path, timestamp), video_id, note) + +        error = resp.get('error') +        if error: +            if error == 'invalid timestamp': +                resp = self._download_json( +                    self._prepare_call(path, int(resp['current_timestamp'])), +                    video_id, '%s (retry)' % note) +                error = resp.get('error') +            if error: +                self._raise_error(resp['error']) + +        return resp -    # iPad2 -    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) 
AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5' +    def _raise_error(self, error): +        raise ExtractorError( +            '%s returned error: %s' % (self.IE_NAME, error), +            expected=True) -    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + +class VikiIE(VikiBaseIE): +    IE_NAME = 'viki' +    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE      _TESTS = [{          'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',          'info_dict': { @@ -37,111 +73,218 @@ class VikiIE(InfoExtractor):          },          'skip': 'Blocked in the US',      }, { +        # clip          'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', -        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c', +        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',          'info_dict': {              'id': '1067139v',              'ext': 'mp4', +            'title': "'The Avengers: Age of Ultron' Press Conference",              'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', +            'duration': 352, +            'timestamp': 1430380829,              'upload_date': '20150430', -            'title': '\'The Avengers: Age of Ultron\' Press Conference', +            'uploader': 'Arirang TV', +            'like_count': int, +            'age_limit': 0,          }      }, {          'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',          'info_dict': {              'id': '1048879v',              'ext': 'mp4', -            'upload_date': '20140820', -            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',              'title': 'Ankhon Dekhi', +            'duration': 6512, +            'timestamp': 1408532356, +            'upload_date': '20140820', +            'uploader': 'Spuul', +            'like_count': int, +            'age_limit': 13,          },          'params': { -            # requires ffmpeg +            
# m3u8 download              'skip_download': True,          } +    }, { +        # episode +        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', +        'md5': '190f3ef426005ba3a080a63325955bc3', +        'info_dict': { +            'id': '44699v', +            'ext': 'mp4', +            'title': 'Boys Over Flowers - Episode 1', +            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2', +            'duration': 4155, +            'timestamp': 1270496524, +            'upload_date': '20100405', +            'uploader': 'group8', +            'like_count': int, +            'age_limit': 13, +        } +    }, { +        # youtube external +        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', +        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', +        'info_dict': { +            'id': '50562v', +            'ext': 'mp4', +            'title': 'Poor Nastya [COMPLETE] - Episode 1', +            'description': '', +            'duration': 607, +            'timestamp': 1274949505, +            'upload_date': '20101213', +            'uploader': 'ad14065n', +            'uploader_id': 'ad14065n', +            'like_count': int, +            'age_limit': 13, +        } +    }, { +        'url': 'http://www.viki.com/player/44699v', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        title = self._og_search_title(webpage) -        description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) - -        uploader_m = re.search( -            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) -        if uploader_m is None: -            uploader = None -        else: -            uploader = uploader_m.group(1).strip() - -        rating_str = self._html_search_regex( -            r'<strong>Rating: </strong>\s*([^<]*)<', webpage, -  
          'rating information', default='').strip() -        age_limit = US_RATINGS.get(rating_str) - -        req = compat_urllib_request.Request( -            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id) -        req.add_header('User-Agent', self._USER_AGENT) -        info_webpage = self._download_webpage( -            req, video_id, note='Downloading info page') -        err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None) -        if err_msg: -            if 'not available in your region' in err_msg: -                raise ExtractorError( -                    'Video %s is blocked from your location.' % video_id, -                    expected=True) -            else: -                raise ExtractorError('Viki said: ' + err_msg) -        mobj = re.search( -            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage) -        if not mobj: -            raise ExtractorError('Unable to find video URL') -        video_url = unescapeHTML(mobj.group('url')) -        video_ext = mimetype2ext(mobj.group('mime_type')) - -        if determine_ext(video_url) == 'm3u8': -            formats = self._extract_m3u8_formats( -                video_url, video_id, ext=video_ext) -        else: -            formats = [{ -                'url': video_url, -                'ext': video_ext, -            }] - -        upload_date_str = self._html_search_regex( -            r'"created_at":"([^"]+)"', info_webpage, 'upload date') -        upload_date = ( -            unified_strdate(upload_date_str) -            if upload_date_str is not None -            else None -        ) - -        # subtitles -        video_subtitles = self.extract_subtitles(video_id, info_webpage) - -        return { +        video = self._call_api( +            'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + +        title = None +        titles = 
video.get('titles') +        if titles: +            title = titles.get('en') or titles[titles.keys()[0]] +        if not title: +            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id +            container_titles = video.get('container', {}).get('titles') +            if container_titles: +                container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] +                title = '%s - %s' % (container_title, title) + +        descriptions = video.get('descriptions') +        description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None + +        duration = int_or_none(video.get('duration')) +        timestamp = parse_iso8601(video.get('created_at')) +        uploader = video.get('author') +        like_count = int_or_none(video.get('likes', {}).get('count')) +        age_limit = parse_age_limit(video.get('rating')) + +        thumbnails = [] +        for thumbnail_id, thumbnail in video.get('images', {}).items(): +            thumbnails.append({ +                'id': thumbnail_id, +                'url': thumbnail.get('url'), +            }) + +        subtitles = {} +        for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): +            subtitles[subtitle_lang] = [{ +                'ext': subtitles_format, +                'url': self._prepare_call( +                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), +            } for subtitles_format in ('srt', 'vtt')] + +        result = {              'id': video_id,              'title': title, -            'formats': formats,              'description': description, -            'thumbnail': thumbnail, -            'age_limit': age_limit, +            'duration': duration, +            'timestamp': timestamp,              'uploader': uploader, -            'subtitles': video_subtitles, -            'upload_date': 
upload_date, +            'like_count': like_count, +            'age_limit': age_limit, +            'thumbnails': thumbnails, +            'subtitles': subtitles,          } -    def _get_subtitles(self, video_id, info_webpage): -        res = {} -        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage): -            sturl = unescapeHTML(sturl_html) -            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) -            if not m: -                continue -            res[m.group('lang')] = [{ -                'url': compat_urlparse.urljoin('http://www.viki.com', sturl), -                'ext': 'vtt', -            }] -        return res +        streams = self._call_api( +            'videos/%s/streams.json' % video_id, video_id, +            'Downloading video streams JSON') + +        if 'external' in streams: +            result.update({ +                '_type': 'url_transparent', +                'url': streams['external']['url'], +            }) +            return result + +        formats = [] +        for format_id, stream_dict in streams.items(): +            height = self._search_regex( +                r'^(\d+)[pP]$', format_id, 'height', default=None) +            for protocol, format_dict in stream_dict.items(): +                if format_id == 'm3u8': +                    formats = self._extract_m3u8_formats( +                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) +                else: +                    formats.append({ +                        'url': format_dict['url'], +                        'format_id': '%s-%s' % (format_id, protocol), +                        'height': height, +                    }) +        self._sort_formats(formats) + +        result['formats'] = formats +        return result + + +class VikiChannelIE(VikiBaseIE): +    IE_NAME = 'viki:channel' +    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE +    _TESTS = [{ +        
'url': 'http://www.viki.com/tv/50c-boys-over-flowers', +        'info_dict': { +            'id': '50c', +            'title': 'Boys Over Flowers', +            'description': 'md5:ecd3cff47967fe193cff37c0bec52790', +        }, +        'playlist_count': 70, +    }, { +        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', +        'info_dict': { +            'id': '1354c', +            'title': 'Poor Nastya [COMPLETE]', +            'description': 'md5:05bf5471385aa8b21c18ad450e350525', +        }, +        'playlist_count': 127, +    }, { +        'url': 'http://www.viki.com/news/24569c-showbiz-korea', +        'only_matching': True, +    }, { +        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', +        'only_matching': True, +    }, { +        'url': 'http://www.viki.com/artists/2141c-shinee', +        'only_matching': True, +    }] + +    _PER_PAGE = 25 + +    def _real_extract(self, url): +        channel_id = self._match_id(url) + +        channel = self._call_api( +            'containers/%s.json' % channel_id, channel_id, +            'Downloading channel JSON') + +        titles = channel['titles'] +        title = titles.get('en') or titles[titles.keys()[0]] + +        descriptions = channel['descriptions'] +        description = descriptions.get('en') or descriptions[descriptions.keys()[0]] + +        entries = [] +        for video_type in ('episodes', 'clips', 'movies'): +            for page_num in itertools.count(1): +                page = self._call_api( +                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' +                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, +                    'Downloading %s JSON page #%d' % (video_type, page_num)) +                for video in page['response']: +                    video_id = video['id'] +                    entries.append(self.url_result( +                        
'http://www.viki.com/videos/%s' % video_id, 'Viki')) +                if not page['pagination']['next']: +                    break + +        return self.playlist_result(entries, channel_id, title, description) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index c3fde53f5..a6d9b5fee 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):          links_code = self._search_regex(              r'''(?xs)                  (?: -                    <img\s+src="/im/play.gif".*?>| +                    <img\s+src="[^"]*/play.gif".*?>|                      <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->                  )                  (.*?) diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py index 1eb24a3d6..faa167e65 100644 --- a/youtube_dl/extractor/vulture.py +++ b/youtube_dl/extractor/vulture.py @@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):          query_webpage = self._download_webpage(              query_url, display_id, note='Downloading query page')          params_json = self._search_regex( -            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n', +            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',              query_webpage,              'player params')          params = json.loads(params_json) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index d6dec25ca..f69d46a28 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):          video_id = mobj.group(1)          webpage = self._download_webpage(url, video_id)          video_url = self._search_regex( -            r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL') +            [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], +            webpage, 'video URL')          if 
YoutubeIE.suitable(video_url):              self.to_screen('Found YouTube video')              return { diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py index 8c6241aed..7c9d8af6f 100644 --- a/youtube_dl/extractor/xminus.py +++ b/youtube_dl/extractor/xminus.py @@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):              r'minus_track\.dur_sec=\'([0-9]*?)\'',              webpage, 'duration', fatal=False))          filesize_approx = parse_filesize(self._html_search_regex( -            r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])', +            r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',              webpage, 'approximate filesize', fatal=False))          tbr = int_or_none(self._html_search_regex(              r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps', @@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):              description = re.sub(' *\r *', '\n', description)          enc_token = self._html_search_regex( -            r'minus_track\.tkn="(.+?)"', webpage, 'enc_token') +            r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')          token = ''.join(              c if pos == 3 else compat_chr(compat_ord(c) - 1)              for pos, c in enumerate(reversed(enc_token))) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index bf4e659ac..f9afbdbab 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,6 +15,7 @@ from ..utils import (      unescapeHTML,      ExtractorError,      int_or_none, +    mimetype2ext,  )  from .nbc import NBCSportsVPlayerIE @@ -236,6 +237,22 @@ class YahooIE(InfoExtractor):          self._sort_formats(formats) +        closed_captions = self._html_search_regex( +            r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', +            default='[]') + +        cc_json = self._parse_json(closed_captions, video_id, fatal=False) +        subtitles = {} +        if cc_json: +            for 
closed_caption in cc_json: +                lang = closed_caption['lang'] +                if lang not in subtitles: +                    subtitles[lang] = [] +                subtitles[lang].append({ +                    'url': closed_caption['url'], +                    'ext': mimetype2ext(closed_caption['content_type']), +                }) +          return {              'id': video_id,              'display_id': display_id, @@ -244,6 +261,7 @@ class YahooIE(InfoExtractor):              'description': clean_html(meta['description']),              'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),              'duration': int_or_none(meta.get('duration')), +            'subtitles': subtitles,          } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 22dbc3aec..5a2315bd9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):      verbosity.add_option(          '--dump-pages', '--dump-intermediate-pages',          action='store_true', dest='dump_intermediate_pages', default=False, -        help='Print downloaded pages to debug problems (very verbose)') +        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')      verbosity.add_option(          '--write-pages',          action='store_true', dest='write_pages', default=False, @@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None):          help='Parse additional metadata like song title / artist from the video title. '               'The format syntax is the same as --output, '               'the parsed parameters replace existing values. ' -             'Additional templates: %(album), %(artist). ' +             'Additional templates: %(album)s, %(artist)s. 
'               'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '               '"Coldplay - Paradise"')      postproc.add_option( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ed9ed9ed6..52d198fa3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1665,6 +1665,7 @@ def mimetype2ext(mt):      return {          'x-ms-wmv': 'wmv',          'x-mp4-fragmented': 'mp4', +        'ttml+xml': 'ttml',      }.get(res, res) @@ -1848,9 +1849,9 @@ def dfxp2srt(dfxp_data):          out = str_or_empty(node.text)          for child in node: -            if child.tag == _x('ttml:br'): +            if child.tag in (_x('ttml:br'), 'br'):                  out += '\n' + str_or_empty(child.tail) -            elif child.tag == _x('ttml:span'): +            elif child.tag in (_x('ttml:span'), 'span'):                  out += str_or_empty(parse_node(child))              else:                  out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1859,7 +1860,10 @@ def dfxp2srt(dfxp_data):      dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))      out = [] -    paras = dfxp.findall(_x('.//ttml:p')) +    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + +    if not paras: +        raise ValueError('Invalid dfxp/TTML subtitle')      for para, index in zip(paras, itertools.count(1)):          begin_time = parse_dfxp_time_expr(para.attrib['begin']) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 38f00bc9b..b33385153 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.05.15' +__version__ = '2015.05.20' | 
