-rw-r--r-- | test/test_utils.py | 38
-rw-r--r-- | youtube_dl/extractor/__init__.py | 1
-rw-r--r-- | youtube_dl/extractor/cspan.py | 11
-rw-r--r-- | youtube_dl/extractor/ellentv.py | 1
-rw-r--r-- | youtube_dl/extractor/instagram.py | 10
-rw-r--r-- | youtube_dl/extractor/mtv.py | 7
-rw-r--r-- | youtube_dl/extractor/orf.py | 14
-rw-r--r-- | youtube_dl/extractor/southpark.py | 15
-rw-r--r-- | youtube_dl/postprocessor/embedthumbnail.py | 3
-rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py | 25
-rw-r--r-- | youtube_dl/utils.py | 53
11 files changed, 160 insertions, 18 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index 2e3a6480c..17017a8c0 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -58,6 +58,8 @@ from youtube_dl.utils import (
     xpath_text,
     render_table,
     match_str,
+    parse_dfxp_time_expr,
+    dfxp2srt,
 )
@@ -581,6 +583,42 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
             'like_count > 100 & dislike_count <? 50 & description',
             {'like_count': 190, 'dislike_count': 10}))
 
+    def test_parse_dfxp_time_expr(self):
+        self.assertEqual(parse_dfxp_time_expr(None), 0.0)
+        self.assertEqual(parse_dfxp_time_expr(''), 0.0)
+        self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
+
+    def test_dfxp2srt(self):
+        dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
+                    <p begin="1" end="2">第二行<br/>♪♪</p>
+                    <p begin="2" end="3"><span>Third<br/>Line</span></p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The following line contains Chinese characters and special symbols
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+♪♪
+
+3
+00:00:02,000 --> 00:00:03,000
+Third
+Line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a64afa1da..ab80fd5e0 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -478,6 +478,7 @@ from .soundgasm import (
 )
 from .southpark import (
     SouthParkIE,
+    SouthParkEsIE,
     SouthparkDeIE,
 )
 from .space import SpaceIE
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d516b1402..fbefd37d0 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -8,6 +8,7 @@ from ..utils import (
     unescapeHTML,
     find_xpath_attr,
     smuggle_url,
+    determine_ext,
 )
 from .senateisvp import SenateISVPIE
@@ -87,6 +88,10 @@ class CSpanIE(InfoExtractor):
             return self.url_result(surl, 'SenateISVP', video_id, title)
 
         files = data['video']['files']
+        try:
+            capfile = data['video']['capfile']['#text']
+        except KeyError:
+            capfile = None
 
         entries = [{
             'id': '%s_%d' % (video_id, partnum + 1),
@@ -97,6 +102,12 @@ class CSpanIE(InfoExtractor):
             'description': description,
             'thumbnail': thumbnail,
             'duration': int_or_none(f.get('length', {}).get('#text')),
+            'subtitles': {
+                'en': [{
+                    'url': capfile,
+                    'ext': determine_ext(capfile, 'dfxp')
+                }],
+            } if capfile else None,
         } for partnum, f in enumerate(files)]
 
         if len(entries) == 1:
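For reference, the C-SPAN change above routes the caption file through youtube-dl's standard per-language subtitles mapping. A minimal sketch of the resulting structure, with a hypothetical caption URL standing in for data['video']['capfile']['#text']:

# Shape of the subtitles value each C-SPAN entry now carries. The URL below is a
# hypothetical placeholder; determine_ext(capfile, 'dfxp') falls back to 'dfxp'
# when the URL carries no usable extension.
capfile = 'http://static.c-span.org/captions/example.dfxp'  # hypothetical example

subtitles = {
    'en': [{
        'url': capfile,
        'ext': 'dfxp',  # what determine_ext() reports for this URL
    }],
} if capfile else None

print(subtitles)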
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py
index 74b50bca2..02c6a4615 100644
--- a/youtube_dl/extractor/ellentv.py
+++ b/youtube_dl/extractor/ellentv.py
@@ -6,7 +6,6 @@ import json
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    parse_iso8601,
 )
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index b020e2621..65f6ca103 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,13 +3,11 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
+    _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
     _TEST = {
         'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
@@ -23,8 +21,8 @@ class InstagramIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+
         webpage = self._download_webpage(url, video_id)
         uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
                                          webpage, 'uploader id', fatal=False)
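As a quick sanity check (not part of the patch), the tightened Instagram pattern still matches the test URL from the diff, with the id group stopping at the first character outside [\da-zA-Z]:

# Standalone check of the new _VALID_URL; self._match_id() does the equivalent
# re.match(...).group('id') internally.
import re

_VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'

mobj = re.match(_VALID_URL, 'http://instagram.com/p/aye83DjauH/?foo=bar#abc')
print(mobj.group('id'))  # aye83DjauH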
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 4430b3416..b48fac5e3 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -25,6 +25,7 @@ def _media_xml_tag(tag):
 
 class MTVServicesInfoExtractor(InfoExtractor):
     _MOBILE_TEMPLATE = None
+    _LANG = None
 
     @staticmethod
     def _id_from_uri(uri):
@@ -169,8 +170,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
         video_id = self._id_from_uri(uri)
         feed_url = self._get_feed_url(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
+        info_url = feed_url + '?'
+        if self._LANG:
+            info_url += 'lang=%s&' % self._LANG
+        info_url += data
         idoc = self._download_xml(
-            feed_url + '?' + data, video_id,
+            info_url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
         return self.playlist_result(
             [self._get_video_info(item) for item in idoc.findall('.//item')])
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index ca1a5bb3c..2e6c9872b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -210,16 +210,16 @@ class ORFIPTVIE(InfoExtractor):
     _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://iptv.orf.at/stories/2267952',
-        'md5': '26ffa4bab6dbce1eee78bbc7021016cd',
+        'url': 'http://iptv.orf.at/stories/2275236/',
+        'md5': 'c8b22af4718a4b4af58342529453e3e5',
         'info_dict': {
-            'id': '339775',
+            'id': '350612',
             'ext': 'flv',
-            'title': 'Kreml-Kritiker Nawalny wieder frei',
-            'description': 'md5:6f24e7f546d364dacd0e616a9e409236',
-            'duration': 84.729,
+            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+            'duration': 68.197,
             'thumbnail': 're:^https?://.*\.jpg$',
-            'upload_date': '20150306',
+            'upload_date': '20150425',
         },
     }
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index c20397b3d..77758bbed 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -5,7 +5,7 @@ from .mtv import MTVServicesInfoExtractor
 
 
 class SouthParkIE(MTVServicesInfoExtractor):
     IE_NAME = 'southpark.cc.com'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
 
     _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
 
@@ -20,9 +20,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
     }]
 
 
+class SouthParkEsIE(SouthParkIE):
+    IE_NAME = 'southpark.cc.com:espanol'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+    _LANG = 'es'
+
+    _TESTS = [{
+        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+        'playlist_count': 4,
+    }]
+
+
 class SouthparkDeIE(SouthParkIE):
     IE_NAME = 'southpark.de'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
     _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
 
     _TESTS = [{
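The new _LANG hook in MTVServicesInfoExtractor is what SouthParkEsIE relies on: when a subclass sets it, the feed request URL gains a lang parameter before the encoded uri. A sketch of the resulting URL, assuming a made-up mgid URI and using the stdlib urlencode in place of compat_urllib_parse.urlencode:

# How the feed info URL is assembled once _LANG is set (e.g. 'es' for SouthParkEsIE).
try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2

feed_url = 'http://www.southparkstudios.com/feeds/video-player/mrss'
uri = 'mgid:arc:episode:southparkstudios.com:example-id'  # hypothetical placeholder
lang = 'es'

info_url = feed_url + '?'
if lang:
    info_url += 'lang=%s&' % lang
info_url += urlencode({'uri': uri})

print(info_url)
# http://www.southparkstudios.com/feeds/video-player/mrss?lang=es&uri=mgid%3Aarc%3Aepisode%3A...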
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index a2d6b14db..7ba98a0ea 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -35,7 +35,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
         compat_urlretrieve(info['thumbnail'], temp_thumbnail)
 
         if info['ext'] == 'mp3':
-            options = ['-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
+            options = [
+                '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
                 '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
 
             self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 7a952963e..1765f4969 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -20,6 +20,7 @@ from ..utils import (
     prepend_extension,
     shell_quote,
     subtitles_filename,
+    dfxp2srt,
 )
@@ -651,6 +652,30 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                     'format' % new_ext)
                 continue
             new_file = subtitles_filename(filename, lang, new_ext)
+
+            if ext == 'dfxp' or ext == 'ttml':
+                self._downloader.report_warning(
+                    'You have requested to convert dfxp (TTML) subtitles into another format, '
+                    'which results in style information loss')
+
+                dfxp_file = subtitles_filename(filename, lang, ext)
+                srt_file = subtitles_filename(filename, lang, 'srt')
+
+                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                    srt_data = dfxp2srt(f.read())
+
+                with io.open(srt_file, 'wt', encoding='utf-8') as f:
+                    f.write(srt_data)
+
+                ext = 'srt'
+                subs[lang] = {
+                    'ext': 'srt',
+                    'data': srt_data
+                }
+
+                if new_ext == 'srt':
+                    continue
+
             self.run_ffmpeg(
                 subtitles_filename(filename, lang, ext),
                 new_file, ['-f', new_format])
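Outside the postprocessor, the same dfxp-to-srt step can be reproduced with the new utils helper. A minimal sketch, assuming a local video.en.dfxp file exists and youtube_dl is importable:

# What FFmpegSubtitlesConvertorPP now does for dfxp/ttml input, reduced to the essentials:
# read the caption file, convert it with dfxp2srt(), write the .srt next to it.
import io

from youtube_dl.utils import dfxp2srt

dfxp_file = 'video.en.dfxp'  # assumed local input file
srt_file = 'video.en.srt'

with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
    srt_data = dfxp2srt(f.read())

with io.open(srt_file, 'wt', encoding='utf-8') as f:
    f.write(srt_data)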
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index edeee1853..5e1c4525d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1800,6 +1800,59 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def parse_dfxp_time_expr(time_expr):
+    if not time_expr:
+        return 0.0
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+
+
+def format_srt_time(seconds):
+    (mins, secs) = divmod(seconds, 60)
+    (hours, mins) = divmod(mins, 60)
+    millisecs = (secs - int(secs)) * 1000
+    secs = int(secs)
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+
+
+def dfxp2srt(dfxp_data):
+    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+
+    def parse_node(node):
+        str_or_empty = functools.partial(str_or_none, default='')
+
+        out = str_or_empty(node.text)
+
+        for child in node:
+            if child.tag == _x('ttml:br'):
+                out += '\n' + str_or_empty(child.tail)
+            elif child.tag == _x('ttml:span'):
+                out += str_or_empty(parse_node(child))
+            else:
+                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+
+        return out
+
+    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    out = []
+    paras = dfxp.findall(_x('.//ttml:p'))
+
+    for para, index in zip(paras, itertools.count(1)):
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            index,
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+            parse_node(para)))
+
+    return ''.join(out)
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
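For reference, the new helpers behave as the added unit tests assert. A short standalone example, with the timing values taken from test_utils.py and a trimmed-down TTML document:

# parse_dfxp_time_expr and dfxp2srt in isolation.
from youtube_dl.utils import parse_dfxp_time_expr, dfxp2srt

print(parse_dfxp_time_expr('0.1s'))          # 0.1
print(parse_dfxp_time_expr('00:00:01.100'))  # 1.1

dfxp = '''<?xml version="1.0" encoding="UTF-8"?>
<tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en">
  <body><div><p begin="0" end="1">Hello</p></div></body>
</tt>'''
print(dfxp2srt(dfxp))
# 1
# 00:00:00,000 --> 00:00:01,000
# Hello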
