about summary refs log tree commit diff
diff options
context:
space:
mode:
author Remita Amine <remitamine@gmail.com> 2018-07-28 20:29:56 +0100
committer Remita Amine <remitamine@gmail.com> 2018-07-28 20:30:44 +0100
commit ec240a43696478e43abb15e7c91f067b2bd5fe08 (patch)
tree 66d473e8c6be8a587908fa1deb11a18a6cebaf9a
parent cd3a3ff93bd5d6866d3822cb438b0e172ffe4e39 (diff)
[dailymotion:playlist] fix extraction(closes #16894)
-rw-r--r-- youtube_dl/extractor/dailymotion.py 124
1 files changed, 83 insertions, 41 deletions
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 8f5f57b98..040f0bd02 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import base64
+import functools
import hashlib
import itertools
import json
@@ -16,11 +17,13 @@ from ..utils import (
error_to_compat_str,
ExtractorError,
int_or_none,
+ mimetype2ext,
+ OnDemandPagedList,
parse_iso8601,
sanitized_Request,
str_to_int,
unescapeHTML,
- mimetype2ext,
+ urlencode_postdata,
)
@@ -343,58 +346,73 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:playlist'
- _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
- _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
- _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
_TESTS = [{
'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
'info_dict': {
'title': 'SPORT',
- 'id': 'xv4bw_nqtv_sport',
+ 'id': 'xv4bw',
},
'playlist_mincount': 20,
}]
-
- def _extract_entries(self, id):
- video_ids = set()
- processed_urls = set()
- for pagenum in itertools.count(1):
- page_url = self._PAGE_TEMPLATE % (id, pagenum)
- webpage, urlh = self._download_webpage_handle_no_ff(
- page_url, id, 'Downloading page %s' % pagenum)
- if urlh.geturl() in processed_urls:
- self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
- page_url, urlh.geturl()), id)
- break
-
- processed_urls.add(urlh.geturl())
-
- for video_id in re.findall(r'data-xid="(.+?)"', webpage):
- if video_id not in video_ids:
- yield self.url_result(
- 'http://www.dailymotion.com/video/%s' % video_id,
- DailymotionIE.ie_key(), video_id)
- video_ids.add(video_id)
-
- if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
- break
+ _PAGE_SIZE = 100
+
+ def _fetch_page(self, playlist_id, authorizaion, page):
+ page += 1
+ videos = self._download_json(
+ 'https://graphql.api.dailymotion.com',
+ playlist_id, 'Downloading page %d' % page,
+ data=json.dumps({
+ 'query': '''{
+ collection(xid: "%s") {
+ videos(first: %d, page: %d) {
+ pageInfo {
+ hasNextPage
+ nextPage
+ }
+ edges {
+ node {
+ xid
+ url
+ }
+ }
+ }
+ }
+}''' % (playlist_id, self._PAGE_SIZE, page)
+ }).encode(), headers={
+ 'Authorization': authorizaion,
+ 'Origin': 'https://www.dailymotion.com',
+ })['data']['collection']['videos']
+ for edge in videos['edges']:
+ node = edge['node']
+ yield self.url_result(
+ node['url'], DailymotionIE.ie_key(), node['xid'])
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
-
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': self._og_search_title(webpage),
- 'entries': self._extract_entries(playlist_id),
- }
-
-
-class DailymotionUserIE(DailymotionPlaylistIE):
+ api = self._parse_json(self._search_regex(
+ r'__PLAYER_CONFIG__\s*=\s*({.+?});',
+ webpage, 'player config'), playlist_id)['context']['api']
+ auth = self._download_json(
+ api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
+ playlist_id, data=urlencode_postdata({
+ 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
+ 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
+ 'grant_type': 'client_credentials',
+ }))
+ authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token'])
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE)
+ return self.playlist_result(
+ entries, playlist_id,
+ self._og_search_title(webpage))
+
+
+class DailymotionUserIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
+ _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
@@ -416,6 +434,30 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'skip': 'Takes too long time',
}]
+ def _extract_entries(self, id):
+ video_ids = set()
+ processed_urls = set()
+ for pagenum in itertools.count(1):
+ page_url = self._PAGE_TEMPLATE % (id, pagenum)
+ webpage, urlh = self._download_webpage_handle_no_ff(
+ page_url, id, 'Downloading page %s' % pagenum)
+ if urlh.geturl() in processed_urls:
+ self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
+ page_url, urlh.geturl()), id)
+ break
+
+ processed_urls.add(urlh.geturl())
+
+ for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+ if video_id not in video_ids:
+ yield self.url_result(
+ 'http://www.dailymotion.com/video/%s' % video_id,
+ DailymotionIE.ie_key(), video_id)
+ video_ids.add(video_id)
+
+ if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
+ break
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')