diff options
Diffstat (limited to 'youtube_dl/extractor/dailymotion.py')
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 124 | 
1 file changed, 83 insertions, 41 deletions
| diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8f5f57b98..040f0bd02 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  import base64 +import functools  import hashlib  import itertools  import json @@ -16,11 +17,13 @@ from ..utils import (      error_to_compat_str,      ExtractorError,      int_or_none, +    mimetype2ext, +    OnDemandPagedList,      parse_iso8601,      sanitized_Request,      str_to_int,      unescapeHTML, -    mimetype2ext, +    urlencode_postdata,  ) @@ -343,58 +346,73 @@ class DailymotionIE(DailymotionBaseInfoExtractor):  class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):      IE_NAME = 'dailymotion:playlist' -    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' -    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' -    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' +    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'      _TESTS = [{          'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',          'info_dict': {              'title': 'SPORT', -            'id': 'xv4bw_nqtv_sport', +            'id': 'xv4bw',          },          'playlist_mincount': 20,      }] - -    def _extract_entries(self, id): -        video_ids = set() -        processed_urls = set() -        for pagenum in itertools.count(1): -            page_url = self._PAGE_TEMPLATE % (id, pagenum) -            webpage, urlh = self._download_webpage_handle_no_ff( -                page_url, id, 'Downloading page %s' % pagenum) -            if urlh.geturl() in processed_urls: -                self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( -                    page_url, urlh.geturl()), id) -                break - -        
    processed_urls.add(urlh.geturl()) - -            for video_id in re.findall(r'data-xid="(.+?)"', webpage): -                if video_id not in video_ids: -                    yield self.url_result( -                        'http://www.dailymotion.com/video/%s' % video_id, -                        DailymotionIE.ie_key(), video_id) -                    video_ids.add(video_id) - -            if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: -                break +    _PAGE_SIZE = 100 + +    def _fetch_page(self, playlist_id, authorizaion, page): +        page += 1 +        videos = self._download_json( +            'https://graphql.api.dailymotion.com', +            playlist_id, 'Downloading page %d' % page, +            data=json.dumps({ +                'query': '''{ +  collection(xid: "%s") { +    videos(first: %d, page: %d) { +      pageInfo { +        hasNextPage +        nextPage +      } +      edges { +        node { +          xid +          url +        } +      } +    } +  } +}''' % (playlist_id, self._PAGE_SIZE, page) +            }).encode(), headers={ +                'Authorization': authorizaion, +                'Origin': 'https://www.dailymotion.com', +            })['data']['collection']['videos'] +        for edge in videos['edges']: +            node = edge['node'] +            yield self.url_result( +                node['url'], DailymotionIE.ie_key(), node['xid'])      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        playlist_id = mobj.group('id') +        playlist_id = self._match_id(url)          webpage = self._download_webpage(url, playlist_id) - -        return { -            '_type': 'playlist', -            'id': playlist_id, -            'title': self._og_search_title(webpage), -            'entries': self._extract_entries(playlist_id), -        } - - -class DailymotionUserIE(DailymotionPlaylistIE): +        api = self._parse_json(self._search_regex( +            
r'__PLAYER_CONFIG__\s*=\s*({.+?});', +            webpage, 'player config'), playlist_id)['context']['api'] +        auth = self._download_json( +            api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), +            playlist_id, data=urlencode_postdata({ +                'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), +                'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), +                'grant_type': 'client_credentials', +            })) +        authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) +        entries = OnDemandPagedList(functools.partial( +            self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) +        return self.playlist_result( +            entries, playlist_id, +            self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor):      IE_NAME = 'dailymotion:user'      _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' +    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'      _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'      _TESTS = [{          'url': 'https://www.dailymotion.com/user/nqtv', @@ -416,6 +434,30 @@ class DailymotionUserIE(DailymotionPlaylistIE):          'skip': 'Takes too long time',      }] +    def _extract_entries(self, id): +        video_ids = set() +        processed_urls = set() +        for pagenum in itertools.count(1): +            page_url = self._PAGE_TEMPLATE % (id, pagenum) +            webpage, urlh = self._download_webpage_handle_no_ff( +                page_url, id, 'Downloading page %s' % pagenum) +            if urlh.geturl() in processed_urls: +                self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( +                    page_url, urlh.geturl()), id) +            
    break + +            processed_urls.add(urlh.geturl()) + +            for video_id in re.findall(r'data-xid="(.+?)"', webpage): +                if video_id not in video_ids: +                    yield self.url_result( +                        'http://www.dailymotion.com/video/%s' % video_id, +                        DailymotionIE.ie_key(), video_id) +                    video_ids.add(video_id) + +            if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: +                break +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          user = mobj.group('user') | 
