Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/__init__.py    |   1 |
| -rw-r--r-- | youtube_dl/extractor/aftenposten.py | 103 |
| -rw-r--r-- | youtube_dl/extractor/aparat.py      |   7 |
| -rw-r--r-- | youtube_dl/extractor/common.py      |  34 |
| -rw-r--r-- | youtube_dl/extractor/goshgay.py     |   4 |
| -rw-r--r-- | youtube_dl/extractor/izlesene.py    |   5 |
| -rw-r--r-- | youtube_dl/extractor/rtp.py         |  43 |
| -rw-r--r-- | youtube_dl/extractor/rts.py         |  28 |
| -rw-r--r-- | youtube_dl/extractor/soulanime.py   |  80 |
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py    |   7 |
| -rw-r--r-- | youtube_dl/extractor/tvigle.py      |  22 |
| -rw-r--r-- | youtube_dl/extractor/tweakers.py    |  58 |
12 files changed, 257 insertions, 135 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 047f7002a..0d7a120bc 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -6,6 +6,7 @@ from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
 from .adobetv import AdobeTVIE
 from .adultswim import AdultSwimIE
+from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py
new file mode 100644
index 000000000..2b257ede7
--- /dev/null
+++ b/youtube_dl/extractor/aftenposten.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    xpath_with_ns,
+    xpath_text,
+    find_xpath_attr,
+)
+
+
+class AftenpostenIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/([^/]+/)*(?P<id>[^/]+)-\d+\.html'
+
+    _TEST = {
+        'url': 'http://www.aftenposten.no/webtv/serier-og-programmer/sweatshopenglish/TRAILER-SWEATSHOP---I-cant-take-any-more-7800835.html?paging=&section=webtv_serierogprogrammer_sweatshop_sweatshopenglish',
+        'md5': 'fd828cd29774a729bf4d4425fe192972',
+        'info_dict': {
+            'id': '21039',
+            'ext': 'mov',
+            'title': 'TRAILER: "Sweatshop" - I can´t take any more',
+            'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
+            'timestamp': 1416927969,
+            'upload_date': '20141125',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._html_search_regex(
+            r'data-xs-id="(\d+)"', webpage, 'video id')
+
+        data = self._download_xml(
+            'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
+
+        NS_MAP = {
+            'atom': 'http://www.w3.org/2005/Atom',
+            'xt': 'http://xstream.dk/',
+            'media': 'http://search.yahoo.com/mrss/',
+        }
+
+        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+        title = xpath_text(
+            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+        description = xpath_text(
+            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+        timestamp = parse_iso8601(xpath_text(
+            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+        formats = []
+        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+            media_url = media_content.get('url')
+            if not media_url:
+                continue
+            tbr = int_or_none(media_content.get('bitrate'))
+            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+            if mobj:
+                formats.append({
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:%s' % mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'ext': 'flv',
+                    'tbr': tbr,
+                    'format_id': 'rtmp-%d' % tbr,
+                })
+            else:
+                formats.append({
+                    'url': media_url,
+                    'tbr': tbr,
+                })
+        self._sort_formats(formats)
+
+        link = find_xpath_attr(
+            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+        if link is not None:
+            formats.append({
+                'url': link.get('href'),
+                'format_id': link.get('rel'),
+            })
+
+        thumbnails = [{
+            'url': splash.get('url'),
+            'width': int_or_none(splash.get('width')),
+            'height': int_or_none(splash.get('height')),
+        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
index 15006336f..63429780e 100644
--- a/youtube_dl/extractor/aparat.py
+++ b/youtube_dl/extractor/aparat.py
@@ -20,6 +20,7 @@ class AparatIE(InfoExtractor):
             'id': 'wP8On',
             'ext': 'mp4',
             'title': 'تیم گلکسی 11 - زومیت',
+            'age_limit': 0,
         },
         # 'skip': 'Extremely unreliable',
     }
@@ -34,7 +35,8 @@ class AparatIE(InfoExtractor):
                      video_id + '/vt/frame')
         webpage = self._download_webpage(embed_url, video_id)
 
-        video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage)
+        video_urls = [video_url.replace('\\/', '/') for video_url in re.findall(
+            r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)]
         for i, video_url in enumerate(video_urls):
             req = HEADRequest(video_url)
             res = self._request_webpage(
@@ -46,7 +48,7 @@
 
         title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title')
         thumbnail = self._search_regex(
-            r'\s+image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
+            r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
 
         return {
             'id': video_id,
@@ -54,4 +56,5 @@
             'url': video_url,
             'ext': 'mp4',
             'thumbnail': thumbnail,
+            'age_limit': self._family_friendly_search(webpage),
         }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 602601b24..2f5ba7aee 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -656,6 +656,21 @@ class InfoExtractor(object):
         }
         return RATING_TABLE.get(rating.lower(), None)
 
+    def _family_friendly_search(self, html):
+        # See http://schema.org/VideoObj
+        family_friendly = self._html_search_meta('isFamilyFriendly', html)
+
+        if not family_friendly:
+            return None
+
+        RATING_TABLE = {
+            '1': 0,
+            'true': 0,
+            '0': 18,
+            'false': 18,
+        }
+        return RATING_TABLE.get(family_friendly.lower(), None)
+
     def _twitter_search_player(self, html):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
@@ -707,9 +722,9 @@
                 f.get('quality') if f.get('quality') is not None else -1,
                 f.get('tbr') if f.get('tbr') is not None else -1,
                 f.get('vbr') if f.get('vbr') is not None else -1,
-                ext_preference,
                 f.get('height') if f.get('height') is not None else -1,
                 f.get('width') if f.get('width') is not None else -1,
+                ext_preference,
                 f.get('abr') if f.get('abr') is not None else -1,
                 audio_ext_preference,
                 f.get('fps') if f.get('fps') is not None else -1,
@@ -765,7 +780,7 @@
         self.to_screen(msg)
         time.sleep(timeout)
 
-    def _extract_f4m_formats(self, manifest_url, video_id):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
             'Unable to download f4m manifest')
@@ -778,26 +793,28 @@
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
-                manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
+                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
+                                + (media_el.attrib.get('href') or media_el.attrib.get('url')))
             tbr = int_or_none(media_el.attrib.get('bitrate'))
-            format_id = 'f4m-%d' % (i if tbr is None else tbr)
             formats.append({
-                'format_id': format_id,
+                'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
                 'url': manifest_url,
                 'ext': 'flv',
                 'tbr': tbr,
                 'width': int_or_none(media_el.attrib.get('width')),
                 'height': int_or_none(media_el.attrib.get('height')),
+                'preference': preference,
             })
         self._sort_formats(formats)
 
         return formats
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
-                              entry_protocol='m3u8', preference=None):
+                              entry_protocol='m3u8', preference=None,
+                              m3u8_id=None):
         formats = [{
-            'format_id': 'm3u8-meta',
+            'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
             'url': m3u8_url,
             'ext': ext,
             'protocol': 'm3u8',
@@ -833,9 +850,8 @@
                     formats.append({'url': format_url(line)})
                     continue
                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
-
                 f = {
-                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
+                    'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
                     'url': format_url(line.strip()),
                     'tbr': tbr,
                     'ext': ext,
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
index b116d251d..1d9166455 100644
--- a/youtube_dl/extractor/goshgay.py
+++ b/youtube_dl/extractor/goshgay.py
@@ -34,8 +34,6 @@ class GoshgayIE(InfoExtractor):
         duration = parse_duration(self._html_search_regex(
             r'<span class="duration">\s*-?\s*(.*?)</span>',
             webpage, 'duration', fatal=False))
-        family_friendly = self._html_search_meta(
-            'isFamilyFriendly', webpage, default='false')
 
         flashvars = compat_parse_qs(self._html_search_regex(
             r'<embed.+?id="flash-player-embed".+?flashvars="([^"]+)"',
@@ -49,5 +47,5 @@
             'title': title,
             'thumbnail': thumbnail,
             'duration': duration,
-            'age_limit': 0 if family_friendly == 'true' else 18,
+            'age_limit': self._family_friendly_search(webpage),
         }
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index d16d483ee..99a1361f8 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -80,9 +80,6 @@ class IzleseneIE(InfoExtractor):
             r'comment_count\s*=\s*\'([^\']+)\';',
             webpage, 'comment_count', fatal=False)
 
-        family_friendly = self._html_search_meta(
-            'isFamilyFriendly', webpage, 'age limit', fatal=False)
-
         content_url = self._html_search_meta(
             'contentURL', webpage, 'content URL', fatal=False)
         ext = determine_ext(content_url, 'mp4')
@@ -120,6 +117,6 @@
             'duration': duration,
             'view_count': int_or_none(view_count),
             'comment_count': int_or_none(comment_count),
-            'age_limit': 18 if family_friendly == 'False' else 0,
+            'age_limit': self._family_friendly_search(webpage),
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
index 7736cabba..ecf4939cd 100644
--- a/youtube_dl/extractor/rtp.py
+++ b/youtube_dl/extractor/rtp.py
@@ -1,16 +1,16 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
+import re
 
 from .common import InfoExtractor
-from ..utils import js_to_json
 
 
 class RTPIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
     _TESTS = [{
         'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
+        'md5': 'e736ce0c665e459ddb818546220b4ef8',
         'info_dict': {
             'id': 'e174042',
             'ext': 'mp3',
@@ -18,9 +18,6 @@ class RTPIE(InfoExtractor):
             'description': 'As paixões musicais de António Cartaxo e António Macedo',
             'thumbnail': 're:^https?://.*\.jpg',
         },
-        'params': {
-            'skip_download': True,  # RTMP download
-        },
     }, {
         'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
         'only_matching': True,
@@ -37,20 +34,48 @@
 
         player_config = self._search_regex(
             r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config')
-        config = json.loads(js_to_json(player_config))
+        config = self._parse_json(player_config, video_id)
 
         path, ext = config.get('file').rsplit('.', 1)
         formats = [{
+            'format_id': 'rtmp',
+            'ext': ext,
+            'vcodec': config.get('type') == 'audio' and 'none' or None,
+            'preference': -2,
+            'url': 'rtmp://{streamer:s}/{application:s}'.format(**config),
             'app': config.get('application'),
             'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path),
             'page_url': url,
-            'url': 'rtmp://{streamer:s}/{application:s}'.format(**config),
             'rtmp_live': config.get('live', False),
-            'ext': ext,
-            'vcodec': config.get('type') == 'audio' and 'none' or None,
             'player_url': 'http://programas.rtp.pt/play/player.swf?v3',
+            'rtmp_real_time': True,
         }]
 
+        # Construct regular HTTP download URLs
+        replacements = {
+            'audio': {
+                'format_id': 'mp3',
+                'pattern': r'^nas2\.share/wavrss/',
+                'repl': 'http://rsspod.rtp.pt/podcasts/',
+                'vcodec': 'none',
+            },
+            'video': {
+                'format_id': 'mp4_h264',
+                'pattern': r'^nas2\.share/h264/',
+                'repl': 'http://rsspod.rtp.pt/videocasts/',
+                'vcodec': 'h264',
+            },
+        }
+        r = replacements[config['type']]
+        if re.match(r['pattern'], config['file']) is not None:
+            formats.append({
+                'format_id': r['format_id'],
+                'url': re.sub(r['pattern'], r['repl'], config['file']),
+                'vcodec': r['vcodec'],
+            })
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
             'title': title,
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index 5e84c1098..d0981115d 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -6,12 +6,14 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     int_or_none,
     parse_duration,
     parse_iso8601,
     unescapeHTML,
+    xpath_text,
 )
 
 
@@ -159,11 +161,27 @@
             return int_or_none(self._search_regex(
                 r'-([0-9]+)k\.', url, 'bitrate', default=None))
 
-        formats = [{
-            'format_id': fid,
-            'url': furl,
-            'tbr': extract_bitrate(furl),
-        } for fid, furl in info['streams'].items()]
+        formats = []
+        for format_id, format_url in info['streams'].items():
+            if format_url.endswith('.f4m'):
+                token = self._download_xml(
+                    'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path,
+                    video_id, 'Downloading %s token' % format_id)
+                auth_params = xpath_text(token, './/authparams', 'auth params')
+                if not auth_params:
+                    continue
+                formats.extend(self._extract_f4m_formats(
+                    '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params),
+                    video_id, f4m_id=format_id))
+            elif format_url.endswith('.m3u8'):
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=format_id))
+            else:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                    'tbr': extract_bitrate(format_url),
+                })
 
         if 'media' in info:
             formats.extend([{
diff --git a/youtube_dl/extractor/soulanime.py b/youtube_dl/extractor/soulanime.py
deleted file mode 100644
index feef33e27..000000000
--- a/youtube_dl/extractor/soulanime.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    HEADRequest,
-    urlhandle_detect_ext,
-)
-
-
-class SoulAnimeWatchingIE(InfoExtractor):
-    IE_NAME = "soulanime:watching"
-    IE_DESC = "SoulAnime video"
-    _TEST = {
-        'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/',
-        'md5': '05fae04abf72298098b528e98abf4298',
-        'info_dict': {
-            'id': 'seirei-tsukai-no-blade-dance-episode-9',
-            'ext': 'mp4',
-            'title': 'seirei-tsukai-no-blade-dance-episode-9',
-            'description': 'seirei-tsukai-no-blade-dance-episode-9'
-        }
-    }
-    _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/watch[^/]*/(?P<id>[^/]+)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        domain = mobj.group('domain')
-
-        page = self._download_webpage(url, video_id)
-
-        video_url_encoded = self._html_search_regex(
-            r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"', page, 'url')
-        video_url = "http://www.soul-anime." + domain + video_url_encoded
-
-        ext_req = HEADRequest(video_url)
-        ext_handle = self._request_webpage(
-            ext_req, video_id, note='Determining extension')
-        ext = urlhandle_detect_ext(ext_handle)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'ext': ext,
-            'title': video_id,
-            'description': video_id
-        }
-
-
-class SoulAnimeSeriesIE(InfoExtractor):
-    IE_NAME = "soulanime:series"
-    IE_DESC = "SoulAnime Series"
-
-    _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/anime./(?P<id>[^/]+)'
-
-    _EPISODE_REGEX = r'<option value="(/watch[^/]*/[^"]+)">[^<]*</option>'
-
-    _TEST = {
-        'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/',
-        'info_dict': {
-            'id': 'black-rock-shooter-tv'
-        },
-        'playlist_count': 8
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        series_id = mobj.group('id')
-        domain = mobj.group('domain')
-
-        pattern = re.compile(self._EPISODE_REGEX)
-
-        page = self._download_webpage(url, series_id, "Downloading series page")
-        mobj = pattern.findall(page)
-
-        entries = [self.url_result("http://www.soul-anime." + domain + obj) for obj in mobj]
-
-        return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 18a823719..e85d452a3 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -15,7 +15,8 @@ class TeamcocoIE(InfoExtractor):
                 'id': '80187',
                 'ext': 'mp4',
                 'title': 'Conan Becomes A Mary Kay Beauty Consultant',
-                'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+                'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.',
+                'age_limit': 0,
             }
         }, {
             'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
@@ -24,7 +25,8 @@
                 'id': '19705',
                 'ext': 'mp4',
                 "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
-                "title": "Louis C.K. Interview Pt. 1 11/3/11"
+                "title": "Louis C.K. Interview Pt. 1 11/3/11",
+                'age_limit': 0,
             }
         }
     ]
@@ -83,4 +85,5 @@
             'title': self._og_search_title(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
             'description': self._og_search_description(webpage),
+            'age_limit': self._family_friendly_search(webpage),
         }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index ba65996dc..102362b29 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -1,6 +1,8 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     float_or_none,
@@ -11,7 +13,7 @@ from ..utils import (
 class TvigleIE(InfoExtractor):
     IE_NAME = 'tvigle'
     IE_DESC = 'Интернет-телевидение Tvigle.ru'
-    _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<id>[^/]+)/$'
+    _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))'
 
     _TESTS = [
         {
@@ -38,16 +40,22 @@
                 'duration': 186.080,
                 'age_limit': 0,
             },
-        },
+        }, {
+            'url': 'https://cloud.tvigle.ru/video/5267604/',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
 
-        video_id = self._html_search_regex(
-            r'<li class="video-preview current_playing" id="(\d+)">', webpage, 'video id')
+        if not video_id:
+            webpage = self._download_webpage(url, display_id)
+            video_id = self._html_search_regex(
+                r'<li class="video-preview current_playing" id="(\d+)">',
+                webpage, 'video id')
 
         video_data = self._download_json(
             'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id)
diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py
index e332d4694..c80ec15cf 100644
--- a/youtube_dl/extractor/tweakers.py
+++ b/youtube_dl/extractor/tweakers.py
@@ -1,35 +1,65 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    xpath_with_ns,
+    int_or_none,
+    float_or_none,
+)
 
 
 class TweakersIE(InfoExtractor):
-    _VALID_URL = r'https?://tweakers\.net/video/(?P<id>[0-9]+).*'
+    _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
     _TEST = {
         'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
-        'md5': 'f7f7f3027166a7f32f024b4ae6571ced',
+        'md5': '1b5afa817403bb5baa08359dca31e6df',
         'info_dict': {
             'id': '9926',
             'ext': 'mp4',
-            'title': 'New-Nintendo-3Ds-Xl-Op-Alle-Fronten-Beter',
+            'title': 'New Nintendo 3DS XL - Op alle fronten beter',
+            'description': 'md5:f97324cc71e86e11c853f0763820e3ba',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+            'duration': 386,
         }
     }
 
     def _real_extract(self, url):
-        splitted_url = re.split('.html|/', url)
-        del splitted_url[-1]  # To remove extra '/' at the end
         video_id = self._match_id(url)
-        title = splitted_url[5].title()  # Retrieve title for URL and capitalize
-        splitted_url[3] = splitted_url[3] + '/player'  # Add /player to get the player page
-        player_url = '/'.join(splitted_url) + '.html'
-        player_page = self._download_webpage(player_url, video_id)
+
+        playlist = self._download_xml(
+            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % video_id,
+            video_id)
+
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        track = playlist.find(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP))
+
+        title = xpath_text(
+            track, xpath_with_ns('./xspf:title', NS_MAP), 'title')
+        description = xpath_text(
+            track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+        thumbnail = xpath_text(
+            track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+        duration = float_or_none(
+            xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'),
+            1000)
+
+        formats = [{
+            'url': location.text,
+            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+        } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
 
         return {
             'id': video_id,
-            'ext': 'mp4',
             'title': title,
-            'url': re.findall('http.*mp4', player_page)[0],
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
         }
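A note on the thread running through most of these hunks: the new InfoExtractor._family_friendly_search() helper added in common.py replaces the hand-rolled isFamilyFriendly parsing that goshgay.py and izlesene.py previously did, and aparat.py and teamcoco.py now report an age_limit through it as well. The snippet below is a minimal standalone sketch of the schema.org isFamilyFriendly-to-age_limit mapping the helper applies; the function name is hypothetical and not part of the patch:

def age_limit_from_family_friendly(value):
    # 'value' is the content of a schema.org isFamilyFriendly <meta> tag,
    # e.g. 'true', 'false', '1' or '0'; unknown or missing values map to None.
    if not value:
        return None
    rating_table = {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }
    return rating_table.get(value.lower())


# Inside an extractor the real helper is called on the downloaded page,
# exactly as the hunks above do:
#     'age_limit': self._family_friendly_search(webpage),
print(age_limit_from_family_friendly('true'))   # -> 0
print(age_limit_from_family_friendly('false'))  # -> 18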
