diff options
| -rw-r--r-- | test/test_InfoExtractor.py | 178 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 43 | 
2 files changed, 214 insertions, 7 deletions
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index d23d94349..71f6608fe 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -107,6 +107,184 @@ class TestInfoExtractor(unittest.TestCase):          self.assertRaises(ExtractorError, self.ie._download_json, uri, None)          self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) +    def test_parse_html5_media_entries(self): +        # from https://www.r18.com/ +        # with kbps in label +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://www.r18.com/', +                r''' +                <video id="samplevideo_amateur" class="js-samplevideo video-js vjs-default-skin vjs-big-play-centered" controls preload="auto" width="400" height="225" poster="//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg"> +                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4" type="video/mp4"  res="240" label="300kbps"> +                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4" type="video/mp4"  res="480" label="1000kbps"> +                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4" type="video/mp4"  res="740" label="1500kbps"> +                    <p>Your browser does not support the video tag.</p> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4', +                    'ext': 'mp4', +                    'format_id': '300kbps', +                    'height': 240, +                    'tbr': 300, +                }, { +                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4', +                    'ext': 
'mp4', +                    'format_id': '1000kbps', +                    'height': 480, +                    'tbr': 1000, +                }, { +                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4', +                    'ext': 'mp4', +                    'format_id': '1500kbps', +                    'height': 740, +                    'tbr': 1500, +                }], +                'thumbnail': '//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg' +            }) + +        # from https://www.csfd.cz/ +        # with width and height +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://www.csfd.cz/', +                r''' +                <video width="770" height="328" preload="none" controls  poster="https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360" > +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4" type="video/mp4" width="640" height="360"> +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4" type="video/mp4" width="1280" height="720"> +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4" type="video/mp4" width="1920" height="1080"> +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm" type="video/webm" width="640" height="360"> +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm" type="video/webm" width="1280" height="720"> +                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm" type="video/webm" width="1920" height="1080"> +                    <track src="https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt" type="text/x-srt" kind="subtitles" srclang="cs" 
label="cs"> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4', +                    'ext': 'mp4', +                    'width': 640, +                    'height': 360, +                }, { +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4', +                    'ext': 'mp4', +                    'width': 1280, +                    'height': 720, +                }, { +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4', +                    'ext': 'mp4', +                    'width': 1920, +                    'height': 1080, +                }, { +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm', +                    'ext': 'webm', +                    'width': 640, +                    'height': 360, +                }, { +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm', +                    'ext': 'webm', +                    'width': 1280, +                    'height': 720, +                }, { +                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm', +                    'ext': 'webm', +                    'width': 1920, +                    'height': 1080, +                }], +                'subtitles': { +                    'cs': [{'url': 'https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt'}] +                }, +                'thumbnail': 'https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360' +            }) + +        # from https://tamasha.com/v/Kkdjw +        # with height in label +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +             
   'https://tamasha.com/v/Kkdjw', +                r''' +                <video crossorigin="anonymous"> +                        <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4" label="AUTO" res="0"/> +                                <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4" +                                        label="240p" res="240"/> +                                <source src="https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4" type="video/mp4" +                                        label="144p" res="144"/> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4', +                }, { +                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4', +                    'ext': 'mp4', +                    'format_id': '240p', +                    'height': 240, +                }, { +                    'url': 'https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4', +                    'ext': 'mp4', +                    'format_id': '144p', +                    'height': 144, +                }] +            }) + +        # from https://www.directvnow.com +        # with data-src +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://www.directvnow.com', +                r''' +                <video id="vid1" class="header--video-masked active" muted playsinline> +                    <source 
data-src="https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4" type="video/mp4" /> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'ext': 'mp4', +                    'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4', +                }] +            }) + +        # from https://www.directvnow.com +        # with data-src +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://www.directvnow.com', +                r''' +                <video id="vid1" class="header--video-masked active" muted playsinline> +                    <source data-src="https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4" type="video/mp4" /> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4', +                    'ext': 'mp4', +                }] +            }) + +        # from https://www.klarna.com/uk/ +        # with data-video-src +        expect_dict( +            self, +            self.ie._parse_html5_media_entries( +                'https://www.directvnow.com', +                r''' +                <video loop autoplay muted class="responsive-video block-kl__video video-on-medium"> +                    <source src="" data-video-desktop data-video-src="https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4" type="video/mp4" /> +                </video> +                ''', None)[0], +            { +                'formats': [{ +                    'url': 
'https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4', +                    'ext': 'mp4', +                }], +            }) +      def test_extract_jwplayer_data_realworld(self):          # from http://www.suffolk.edu/sjc/          expect_dict( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c291bc1df..0889288f0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import (      compiled_regex_type,      determine_ext,      determine_protocol, +    dict_get,      error_to_compat_str,      ExtractorError,      extract_attributes, @@ -56,13 +57,16 @@ from ..utils import (      JSON_LD_RE,      mimetype2ext,      orderedSet, +    parse_bitrate,      parse_codecs,      parse_duration,      parse_iso8601,      parse_m3u8_attributes, +    parse_resolution,      RegexNotFoundError,      sanitized_Request,      sanitize_filename, +    str_or_none,      unescapeHTML,      unified_strdate,      unified_timestamp, @@ -2481,18 +2485,43 @@ class InfoExtractor(object):              media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))              if media_content:                  for source_tag in re.findall(r'<source[^>]+>', media_content): -                    source_attributes = extract_attributes(source_tag) -                    src = source_attributes.get('src') +                    s_attr = extract_attributes(source_tag) +                    # data-video-src and data-src are non standard but seen +                    # several times in the wild +                    src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))                      if not src:                          continue -                    f = parse_content_type(source_attributes.get('type')) +                    f = parse_content_type(s_attr.get('type'))                      is_plain_url, formats = _media_formats(src, media_type, f)   
                   if is_plain_url: -                        # res attribute is not standard but seen several times -                        # in the wild +                        # width, height, res, label and title attributes are +                        # all not standard but seen several times in the wild +                        labels = [ +                            s_attr.get(lbl) +                            for lbl in ('label', 'title') +                            if str_or_none(s_attr.get(lbl)) +                        ] +                        width = int_or_none(s_attr.get('width')) +                        height = (int_or_none(s_attr.get('height')) or +                                  int_or_none(s_attr.get('res'))) +                        if not width or not height: +                            for lbl in labels: +                                resolution = parse_resolution(lbl) +                                if not resolution: +                                    continue +                                width = width or resolution.get('width') +                                height = height or resolution.get('height') +                        for lbl in labels: +                            tbr = parse_bitrate(lbl) +                            if tbr: +                                break +                        else: +                            tbr = None                          f.update({ -                            'height': int_or_none(source_attributes.get('res')), -                            'format_id': source_attributes.get('label'), +                            'width': width, +                            'height': height, +                            'tbr': tbr, +                            'format_id': s_attr.get('label') or s_attr.get('title'),                          })                          f.update(formats[0])                          media_info['formats'].append(f)  | 
