diff options
| author | Remita Amine <remitamine@gmail.com> | 2018-05-10 08:19:32 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2018-05-10 08:19:56 +0100 | 
| commit | ff8889cd4dfae0ae3758e3d8a496f5724f6dc092 (patch) | |
| tree | ec82c0512b2973c94d1d0eff36fbc363f79ef546 | |
| parent | 9e18bb4c67af7b748ee62247d751c0e705aa791a (diff) | |
[teamcoco] fix extraction(closes #16374)
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 179 | 
1 file changed, 82 insertions, 97 deletions
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9056c8cbc..f06e5b19a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,35 +1,34 @@  # coding: utf-8  from __future__ import unicode_literals -import binascii -import re  import json  from .common import InfoExtractor -from ..compat import ( -    compat_b64decode, -    compat_ord, -)  from ..utils import ( +    determine_ext,      ExtractorError, +    int_or_none, +    mimetype2ext, +    parse_duration, +    parse_iso8601,      qualities, -    determine_ext,  )  class TeamcocoIE(InfoExtractor): -    _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' +    _VALID_URL = r'https?://teamcoco\.com/video/(?P<id>[^/?#]+)'      _TESTS = [          { -            'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', -            'md5': '3f7746aa0dc86de18df7539903d399ea', +            'url': 'http://teamcoco.com/video/mary-kay-remote', +            'md5': '55d532f81992f5c92046ad02fec34d7d',              'info_dict': {                  'id': '80187',                  'ext': 'mp4',                  'title': 'Conan Becomes A Mary Kay Beauty Consultant',                  'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', -                'duration': 504, -                'age_limit': 0, +                'duration': 495.0, +                'upload_date': '20140402', +                'timestamp': 1396407600,              }          }, {              'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -40,7 +39,8 @@ class TeamcocoIE(InfoExtractor):                  'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',                  'title': 'Louis C.K. Interview Pt. 
1 11/3/11',                  'duration': 288, -                'age_limit': 0, +                'upload_date': '20111104', +                'timestamp': 1320405840,              }          }, {              'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -49,6 +49,8 @@ class TeamcocoIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Timothy Olyphant Raises A Toast To “Justified”',                  'description': 'md5:15501f23f020e793aeca761205e42c24', +                'upload_date': '20150415', +                'timestamp': 1429088400,              },              'params': {                  'skip_download': True,  # m3u8 downloads @@ -63,110 +65,93 @@ class TeamcocoIE(InfoExtractor):              },              'params': {                  'skip_download': True,  # m3u8 downloads -            } +            }, +            'skip': 'This video is no longer available.',          }      ] -    _VIDEO_ID_REGEXES = ( -        r'"eVar42"\s*:\s*(\d+)', -        r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', -        r'"id_not"\s*:\s*(\d+)' -    ) -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) +    def _graphql_call(self, query_template, object_type, object_id): +        find_object = 'find' + object_type +        return self._download_json( +            'http://teamcoco.com/graphql/', object_id, data=json.dumps({ +                'query': query_template % (find_object, object_id) +            }))['data'][find_object] -        display_id = mobj.group('display_id') -        webpage, urlh = self._download_webpage_handle(url, display_id) -        if 'src=expired' in urlh.geturl(): -            raise ExtractorError('This video is expired.', expected=True) - -        video_id = mobj.group('video_id') -        if not video_id: -            video_id = self._html_search_regex( -                self._VIDEO_ID_REGEXES, webpage, 'video id') - -        data = None - -        preload_codes = 
self._html_search_regex( -            r'(function.+)setTimeout\(function\(\)\{playlist', -            webpage, 'preload codes') -        base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) -        base64_fragments.remove('init') - -        def _check_sequence(cur_fragments): -            if not cur_fragments: -                return -            for i in range(len(cur_fragments)): -                cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') -                try: -                    raw_data = compat_b64decode(cur_sequence) -                    if compat_ord(raw_data[0]) == compat_ord('{'): -                        return json.loads(raw_data.decode('utf-8')) -                except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): -                    continue - -        def _check_data(): -            for i in range(len(base64_fragments) + 1): -                for j in range(i, len(base64_fragments) + 1): -                    data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) -                    if data: -                        return data - -        self.to_screen('Try to compute possible data sequence. This may take some time.') -        data = _check_data() - -        if not data: -            raise ExtractorError( -                'Preload information could not be extracted', expected=True) +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        response = self._graphql_call('''{ +  %s(slug: "video/%s") { +    ... on RecordSlug { +      record { +        id +        title +        teaser +        publishOn +        thumb { +          preview +        } +        tags { +          name +        } +        duration +      } +    } +    ... 
on NotFoundSlug { +      status +    } +  } +}''', 'Slug', display_id) +        if response.get('status'): +            raise ExtractorError('This video is no longer available.', expected=True) + +        record = response['record'] +        video_id = record['id'] + +        srcs = self._graphql_call('''{ +  %s(id: "%s") { +    src +  } +}''', 'RecordVideoSource', video_id)['src']          formats = [] -        get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) -        for filed in data['files']: -            if determine_ext(filed['url']) == 'm3u8': -                # compat_urllib_parse.urljoin does not work here -                if filed['url'].startswith('/'): -                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] -                else: -                    m3u8_url = filed['url'] -                m3u8_formats = self._extract_m3u8_formats( -                    m3u8_url, video_id, ext='mp4') -                for m3u8_format in m3u8_formats: -                    if m3u8_format not in formats: -                        formats.append(m3u8_format) -            elif determine_ext(filed['url']) == 'f4m': -                # TODO Correct f4m extraction +        get_quality = qualities(['low', 'sd', 'hd', 'uhd']) +        for format_id, src in srcs.items(): +            if not isinstance(src, dict): +                continue +            src_url = src.get('src') +            if not src_url:                  continue +            ext = determine_ext(src_url, mimetype2ext(src.get('type'))) +            if format_id == 'hls' or ext == 'm3u8': +                # compat_urllib_parse.urljoin does not work here +                if src_url.startswith('/'): +                    src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url +                formats.extend(self._extract_m3u8_formats( +                    src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))              else: -                if 
filed['url'].startswith('/mp4:protected/'): +                if src_url.startswith('/mp4:protected/'):                      # TODO Correct extraction for these files                      continue -                m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) -                if m_format is not None: -                    format_id = m_format.group(1) -                else: -                    format_id = filed['bitrate'] -                tbr = ( -                    int(filed['bitrate']) -                    if filed['bitrate'].isdigit() -                    else None) +                tbr = int_or_none(self._search_regex( +                    r'(\d+)k\.mp4', src_url, 'tbr', default=None))                  formats.append({ -                    'url': filed['url'], -                    'ext': 'mp4', +                    'url': src_url, +                    'ext': ext,                      'tbr': tbr,                      'format_id': format_id,                      'quality': get_quality(format_id),                  }) -          self._sort_formats(formats)          return {              'id': video_id,              'display_id': display_id,              'formats': formats, -            'title': data['title'], -            'thumbnail': data.get('thumb', {}).get('href'), -            'description': data.get('teaser'), -            'duration': data.get('duration'), -            'age_limit': self._family_friendly_search(webpage), +            'title': record['title'], +            'thumbnail': record.get('thumb', {}).get('preview'), +            'description': record.get('teaser'), +            'duration': parse_duration(record.get('duration')), +            'timestamp': parse_iso8601(record.get('publishOn')),          }  | 
