Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--  youtube_dl/extractor/amcnetworks.py | 8
-rw-r--r--  youtube_dl/extractor/aparat.py | 49
-rw-r--r--  youtube_dl/extractor/ard.py | 14
-rw-r--r--  youtube_dl/extractor/arte.py | 12
-rw-r--r--  youtube_dl/extractor/bandcamp.py | 7
-rw-r--r--  youtube_dl/extractor/bbc.py | 3
-rwxr-xr-x  youtube_dl/extractor/cda.py | 2
-rw-r--r--  youtube_dl/extractor/cinchcast.py | 14
-rw-r--r--  youtube_dl/extractor/clipfish.py | 67
-rw-r--r--  youtube_dl/extractor/clippit.py | 74
-rw-r--r--  youtube_dl/extractor/cloudy.py | 6
-rw-r--r--  youtube_dl/extractor/common.py | 33
-rw-r--r--  youtube_dl/extractor/dispeak.py | 6
-rw-r--r--  youtube_dl/extractor/dplay.py | 66
-rw-r--r--  youtube_dl/extractor/dramafever.py | 17
-rw-r--r--  youtube_dl/extractor/egghead.py | 49
-rw-r--r--  youtube_dl/extractor/extractors.py | 34
-rw-r--r--  youtube_dl/extractor/fourtube.py | 174
-rw-r--r--  youtube_dl/extractor/funnyordie.py | 64
-rw-r--r--  youtube_dl/extractor/generic.py | 101
-rw-r--r--  youtube_dl/extractor/itv.py | 8
-rw-r--r--  youtube_dl/extractor/laola1tv.py | 18
-rw-r--r--  youtube_dl/extractor/limelight.py | 19
-rw-r--r--  youtube_dl/extractor/liveleak.py | 78
-rw-r--r--  youtube_dl/extractor/megaphone.py | 55
-rw-r--r--  youtube_dl/extractor/mixcloud.py | 48
-rw-r--r--  youtube_dl/extractor/mlb.py | 8
-rw-r--r--  youtube_dl/extractor/mpora.py | 62
-rw-r--r--  youtube_dl/extractor/mtv.py | 31
-rw-r--r--  youtube_dl/extractor/nick.py | 24
-rw-r--r--  youtube_dl/extractor/niconico.py | 266
-rw-r--r--  youtube_dl/extractor/npo.py | 7
-rw-r--r--  youtube_dl/extractor/nrk.py | 2
-rw-r--r--  youtube_dl/extractor/pbs.py | 20
-rw-r--r--  youtube_dl/extractor/periscope.py | 20
-rw-r--r--  youtube_dl/extractor/pluralsight.py | 53
-rw-r--r--  youtube_dl/extractor/podomatic.py | 63
-rw-r--r--  youtube_dl/extractor/pornhd.py | 2
-rw-r--r--  youtube_dl/extractor/pornhub.py | 20
-rw-r--r--  youtube_dl/extractor/qqmusic.py | 160
-rw-r--r--  youtube_dl/extractor/reddit.py | 114
-rw-r--r--  youtube_dl/extractor/soundcloud.py | 143
-rw-r--r--  youtube_dl/extractor/spiegel.py | 20
-rw-r--r--  youtube_dl/extractor/sportbox.py | 61
-rw-r--r--  youtube_dl/extractor/svt.py | 3
-rw-r--r--  youtube_dl/extractor/tbs.py | 9
-rw-r--r--  youtube_dl/extractor/teamfourstar.py | 48
-rw-r--r--  youtube_dl/extractor/twentymin.py | 2
-rw-r--r--  youtube_dl/extractor/udemy.py | 49
-rw-r--r--  youtube_dl/extractor/vh1.py | 12
-rw-r--r--  youtube_dl/extractor/vidio.py | 3
-rw-r--r--  youtube_dl/extractor/vidme.py | 34
-rw-r--r--  youtube_dl/extractor/vlive.py | 7
-rw-r--r--  youtube_dl/extractor/voot.py | 98
-rw-r--r--  youtube_dl/extractor/vzaar.py | 8
-rw-r--r--  youtube_dl/extractor/watchbox.py | 151
-rw-r--r--  youtube_dl/extractor/xxxymovies.py | 4
-rw-r--r--  youtube_dl/extractor/yandexdisk.py | 118
-rw-r--r--  youtube_dl/extractor/youjizz.py | 78
-rw-r--r--  youtube_dl/extractor/youku.py | 78
60 files changed, 2075 insertions, 729 deletions
diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py
index 3a0ec6776..dd3b18d72 100644
--- a/youtube_dl/extractor/amcnetworks.py
+++ b/youtube_dl/extractor/amcnetworks.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
from .theplatform import ThePlatformIE
from ..utils import (
- update_url_query,
- parse_age_limit,
int_or_none,
+ parse_age_limit,
+ try_get,
+ update_url_query,
)
@@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE):
info = self._parse_theplatform_metadata(theplatform_metadata)
video_id = theplatform_metadata['pid']
title = theplatform_metadata['title']
- rating = theplatform_metadata['ratings'][0]['rating']
+ rating = try_get(
+ theplatform_metadata, lambda x: x['ratings'][0]['rating'])
auth_required = self._search_regex(
r'window\.authRequired\s*=\s*(true|false);',
webpage, 'auth required')
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
index 025e29aa4..e394cb661 100644
--- a/youtube_dl/extractor/aparat.py
+++ b/youtube_dl/extractor/aparat.py
@@ -3,13 +3,13 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
- HEADRequest,
+ int_or_none,
+ mimetype2ext,
)
class AparatIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'http://www.aparat.com/v/wP8On',
@@ -29,30 +29,41 @@ class AparatIE(InfoExtractor):
# Note: There is an easier-to-parse configuration at
# http://www.aparat.com/video/video/config/videohash/%video_id
# but the URL in there does not work
- embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id
- webpage = self._download_webpage(embed_url, video_id)
-
- file_list = self._parse_json(self._search_regex(
- r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id)
- for i, item in enumerate(file_list[0]):
- video_url = item['file']
- req = HEADRequest(video_url)
- res = self._request_webpage(
- req, video_id, note='Testing video URL %d' % i, errnote=False)
- if res:
- break
- else:
- raise ExtractorError('No working video URLs found')
+ webpage = self._download_webpage(
+ 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
+ video_id)
title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title')
+
+ file_list = self._parse_json(
+ self._search_regex(
+ r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage,
+ 'file list'),
+ video_id)
+
+ formats = []
+ for item in file_list[0]:
+ file_url = item.get('file')
+ if not file_url:
+ continue
+ ext = mimetype2ext(item.get('type'))
+ label = item.get('label')
+ formats.append({
+ 'url': file_url,
+ 'ext': ext,
+ 'format_id': label or ext,
+ 'height': int_or_none(self._search_regex(
+ r'(\d+)[pP]', label or '', 'height', default=None)),
+ })
+ self._sort_formats(formats)
+
thumbnail = self._search_regex(
r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
'title': title,
- 'url': video_url,
- 'ext': 'mp4',
'thumbnail': thumbnail,
'age_limit': self._family_friendly_search(webpage),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 2d5599456..3f248b147 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor):
duration = int_or_none(media_info.get('_duration'))
thumbnail = media_info.get('_previewImage')
+ is_live = media_info.get('_isLive') is True
subtitles = {}
subtitle_url = media_info.get('_subtitleUrl')
@@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor):
'id': video_id,
'duration': duration,
'thumbnail': thumbnail,
+ 'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}
@@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor):
# determine video id from url
m = re.match(self._VALID_URL, url)
+ document_id = None
+
numid = re.search(r'documentId=([0-9]+)', url)
if numid:
- video_id = numid.group(1)
+ document_id = video_id = numid.group(1)
else:
video_id = m.group('video_id')
@@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor):
'formats': formats,
}
else: # request JSON file
+ if not document_id:
+ video_id = self._search_regex(
+ r'/play/(?:config|media)/(\d+)', webpage, 'media id')
info = self._extract_media_info(
- 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
+ 'http://www.ardmediathek.de/play/media/%s' % video_id,
+ webpage, video_id)
info.update({
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if info.get('is_live') else title,
'description': description,
'thumbnail': thumbnail,
})
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 56baef29d..02613cf5d 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -9,12 +9,13 @@ from ..compat import (
compat_urllib_parse_urlparse,
)
from ..utils import (
+ ExtractorError,
find_xpath_attr,
- unified_strdate,
get_element_by_attribute,
int_or_none,
NO_DEFAULT,
qualities,
+ unified_strdate,
)
# There are different sources of video in arte.tv, the extraction process
@@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
+ vsr = player_info['VSR']
+
+ if not vsr and not player_info.get('VRU'):
+ raise ExtractorError(
+ 'Video %s is not available' % (player_info.get('VID') or video_id),
+ expected=True)
+
upload_date_str = player_info.get('shootingDate')
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
@@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor):
langcode = LANGS.get(lang, lang)
formats = []
- for format_id, format_dict in player_info['VSR'].items():
+ for format_id, format_dict in vsr.items():
f = dict(format_dict)
versionCode = f.get('versionCode')
l = re.escape(langcode)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 9ddb9af17..be41bd5a2 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -242,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor):
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
- self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+ self.url_result(
+ compat_urlparse.urljoin(url, t_path),
+ ie=BandcampIE.ie_key(),
+ video_title=self._search_regex(
+ r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+ elem_content, 'track title', fatal=False))
for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 79ded6ba1..911ae6780 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -37,7 +37,8 @@ class BBCCoUkIE(InfoExtractor):
programmes/(?!articles/)|
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]|
- radio/player/
+ radio/player/|
+ events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 78b7a923c..0c3af23d5 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -124,7 +124,7 @@ class CDAIE(InfoExtractor):
}
def extract_format(page, version):
- json_str = self._search_regex(
+ json_str = self._html_search_regex(
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
'%s player_json' % version, fatal=False, group='player_data')
if not json_str:
diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py
index 562c9bbbb..b861d54b0 100644
--- a/youtube_dl/extractor/cinchcast.py
+++ b/youtube_dl/extractor/cinchcast.py
@@ -9,12 +9,20 @@ from ..utils import (
class CinchcastIE(InfoExtractor):
- _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
+ 'info_dict': {
+ 'id': '5258197',
+ 'ext': 'mp3',
+ 'title': 'Train Your Brain to Up Your Game with Coach Mandy',
+ 'upload_date': '20130816',
+ },
+ }, {
# Actual test is run in generic; look for undergroundwellness
'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
'only_matching': True,
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
deleted file mode 100644
index 0920f6219..000000000
--- a/youtube_dl/extractor/clipfish.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- unified_strdate,
-)
-
-
-class ClipfishIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
- 'info_dict': {
- 'id': '4343170',
- 'ext': 'mp4',
- 'title': 'S01 E01 - Ugly Americans - Date in der Hölle',
- 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.',
- 'upload_date': '20161005',
- 'duration': 1291,
- 'view_count': int,
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- video_info = self._download_json(
- 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id,
- video_id)['items'][0]
-
- formats = []
-
- m3u8_url = video_info.get('media_videourl_hls')
- if m3u8_url:
- formats.append({
- 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
- 'ext': 'mp4',
- 'format_id': 'hls',
- })
-
- mp4_url = video_info.get('media_videourl')
- if mp4_url:
- formats.append({
- 'url': mp4_url,
- 'format_id': 'mp4',
- 'width': int_or_none(video_info.get('width')),
- 'height': int_or_none(video_info.get('height')),
- 'tbr': int_or_none(video_info.get('bitrate')),
- })
-
- descr = video_info.get('descr')
- if descr:
- descr = descr.strip()
-
- return {
- 'id': video_id,
- 'title': video_info['title'],
- 'description': descr,
- 'formats': formats,
- 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'),
- 'duration': int_or_none(video_info.get('media_length')),
- 'upload_date': unified_strdate(video_info.get('pubDate')),
- 'view_count': int_or_none(video_info.get('media_views'))
- }
diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py
new file mode 100644
index 000000000..a1a7a774c
--- /dev/null
+++ b/youtube_dl/extractor/clippit.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ qualities,
+)
+
+import re
+
+
+class ClippitIE(InfoExtractor):
+
+ _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)'
+ _TEST = {
+ 'url': 'https://www.clippituser.tv/c/evmgm',
+ 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09',
+ 'info_dict': {
+ 'id': 'evmgm',
+ 'ext': 'mp4',
+ 'title': 'Bye bye Brutus. #BattleBots - Clippit',
+ 'uploader': 'lizllove',
+ 'uploader_url': 'https://www.clippituser.tv/p/lizllove',
+ 'timestamp': 1472183818,
+ 'upload_date': '20160826',
+ 'description': 'BattleBots | ABC',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title')
+
+ FORMATS = ('sd', 'hd')
+ quality = qualities(FORMATS)
+ formats = []
+ for format_id in FORMATS:
+ url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id,
+ webpage, 'url', fatal=False)
+ if not url:
+ continue
+ match = re.search(r'/(?P<height>\d+)\.mp4', url)
+ formats.append({
+ 'url': url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ 'height': int(match.group('height')) if match else None,
+ })
+
+ uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n',
+ webpage, 'uploader', fatal=False)
+ uploader_url = ('https://www.clippituser.tv/p/' + uploader
+ if uploader else None)
+
+ timestamp = self._html_search_regex(r'datetime="(.+?)"',
+ webpage, 'date', fatal=False)
+ thumbnail = self._html_search_regex(r'data-image="(.+?)"',
+ webpage, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_url': uploader_url,
+ 'timestamp': parse_iso8601(timestamp),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': thumbnail,
+ }
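For context, the qualities() helper used above builds a scorer that ranks a format_id by its position in the given tuple, so 'hd' outranks 'sd' when formats are sorted. A rough sketch of its implementation in youtube_dl/utils.py:

    def qualities(quality_ids):
        def q(qid):
            try:
                return quality_ids.index(qid)
            except ValueError:
                return -1  # unknown ids sort below all known ones
        return q

    quality = qualities(('sd', 'hd'))
    quality('hd')  # -> 1, preferred over quality('sd') == 0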
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 9bc8dbea4..85ca20ecc 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(
- 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
+ 'https://www.cloudy.ec/embed.php', video_id, query={
+ 'id': video_id,
+ 'playerPage': 1,
+ 'autoplay': 1,
+ })
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 748b4d59f..ceba4ca1c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -940,7 +940,8 @@ class InfoExtractor(object):
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
- family_friendly = self._html_search_meta('isFamilyFriendly', html)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', html, default=None)
if not family_friendly:
return None
@@ -1785,7 +1786,7 @@ class InfoExtractor(object):
ms_info['timescale'] = int(timescale)
segment_duration = source.get('duration')
if segment_duration:
- ms_info['segment_duration'] = int(segment_duration)
+ ms_info['segment_duration'] = float(segment_duration)
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
@@ -1892,9 +1893,13 @@ class InfoExtractor(object):
'Bandwidth': bandwidth,
}
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
@@ -1904,7 +1909,7 @@ class InfoExtractor(object):
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
- 'url': media_template % {
+ media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
@@ -1928,7 +1933,7 @@ class InfoExtractor(object):
'Number': segment_number,
}
representation_ms_info['fragments'].append({
- 'url': segment_url,
+ media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
@@ -1952,8 +1957,9 @@ class InfoExtractor(object):
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
- 'url': representation_ms_info['segment_urls'][segment_index],
+ location_key(segment_uri): segment_uri,
'duration': duration,
})
segment_index += 1
@@ -1962,6 +1968,7 @@ class InfoExtractor(object):
# No fragments key is present in this case.
if 'fragments' in representation_ms_info:
f.update({
+ 'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
})
@@ -1969,10 +1976,8 @@ class InfoExtractor(object):
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
- f['fragments'].append({'url': initialization_url})
+ f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
- for fragment in f['fragments']:
- fragment['url'] = urljoin(base_url, fragment['url'])
try:
existing_format = next(
fo for fo in formats
@@ -2110,19 +2115,19 @@ class InfoExtractor(object):
return f
return {}
- def _media_formats(src, cur_media_type):
+ def _media_formats(src, cur_media_type, type_info={}):
full_url = absolute_url(src)
- ext = determine_ext(full_url)
+ ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
- preference=preference)
+ preference=preference, fatal=False)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
- full_url, video_id, mpd_id=mpd_id)
+ full_url, video_id, mpd_id=mpd_id, fatal=False)
else:
is_plain_url = True
formats = [{
@@ -2161,9 +2166,9 @@ class InfoExtractor(object):
src = source_attributes.get('src')
if not src:
continue
- is_plain_url, formats = _media_formats(src, media_type)
+ f = parse_content_type(source_attributes.get('type'))
+ is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
- f = parse_content_type(source_attributes.get('type'))
f.update(formats[0])
media_info['formats'].append(f)
else:
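The location_key()/fragment_base_url change above stops pre-joining every DASH fragment URL: relative segment paths are stored under 'path' next to a single fragment_base_url and resolved only at download time, while absolute locations keep the 'url' key. A simplified illustration with made-up URLs:

    import re

    def location_key(location):
        return 'url' if re.match(r'^https?://', location) else 'path'

    segments = ['seg-000.m4s', 'https://cdn.example.com/v/seg-001.m4s']
    f = {
        'fragment_base_url': 'https://cdn.example.com/v/',  # joined at download time
        'fragments': [{location_key(s): s} for s in segments],
        'protocol': 'http_dash_segments',
    }
    # f['fragments'] == [{'path': 'seg-000.m4s'},
    #                    {'url': 'https://cdn.example.com/v/seg-001.m4s'}]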
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
index a78cb8a2a..c05f601e2 100644
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@@ -13,7 +13,7 @@ from ..utils import (
class DigitallySpeakingIE(InfoExtractor):
- _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+ _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
_TESTS = [{
# From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
# From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
'only_matching': True,
+ }, {
+ # From http://www.gdcvault.com/play/1013700/Advanced-Material
+ 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+ 'only_matching': True,
}]
def _parse_mp4(self, metadata):
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 1a41760f8..76e784105 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -7,16 +7,18 @@ import time
from .common import InfoExtractor
from ..compat import (
- compat_urlparse,
compat_HTTPError,
+ compat_str,
+ compat_urlparse,
)
from ..utils import (
- USER_AGENTS,
ExtractorError,
int_or_none,
- unified_strdate,
remove_end,
+ try_get,
+ unified_strdate,
update_url_query,
+ USER_AGENTS,
)
@@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- info_url = self._search_regex(
- r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
- webpage, 'video id')
-
title = remove_end(self._og_search_title(webpage), ' | Dplay')
- try:
- info = self._download_json(
- info_url, display_id, headers={
- 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
- 'dplayit_token').value,
- 'Referer': url,
- })
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
- info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
- error = info['errors'][0]
- if error.get('code') == 'access.denied.geoblocked':
- self.raise_geo_restricted(
- msg=error.get('detail'), countries=self._GEO_COUNTRIES)
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
- raise
+ video_id = None
+
+ info = self._search_regex(
+ r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")',
+ webpage, 'playback JSON', default=None)
+ if info:
+ for _ in range(2):
+ info = self._parse_json(info, display_id, fatal=False)
+ if not info:
+ break
+ else:
+ video_id = try_get(info, lambda x: x['data']['id'])
+
+ if not info:
+ info_url = self._search_regex(
+ r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
+ webpage, 'info url')
+
+ video_id = info_url.rpartition('/')[-1]
+
+ try:
+ info = self._download_json(
+ info_url, display_id, headers={
+ 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
+ 'dplayit_token').value,
+ 'Referer': url,
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
+ info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+ error = info['errors'][0]
+ if error.get('code') == 'access.denied.geoblocked':
+ self.raise_geo_restricted(
+ msg=error.get('detail'), countries=self._GEO_COUNTRIES)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ raise
hls_url = info['data']['attributes']['streaming']['hls']['url']
@@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor):
season_number = episode_number = upload_date = None
return {
- 'id': info_url.rpartition('/')[-1],
+ 'id': compat_str(video_id or display_id),
'display_id': display_id,
'title': title,
'description': self._og_search_description(webpage),
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index e7abc8889..9a498d72a 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -12,6 +12,7 @@ from ..utils import (
ExtractorError,
clean_html,
int_or_none,
+ remove_end,
sanitized_Request,
urlencode_postdata
)
@@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
'info_dict': {
'id': '4512.1',
- 'ext': 'mp4',
- 'title': 'Cooking with Shin 4512.1',
+ 'ext': 'flv',
+ 'title': 'Cooking with Shin',
'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
'episode': 'Episode 1',
'episode_number': 1,
'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1404336058,
'upload_date': '20140702',
- 'duration': 343,
+ 'duration': 344,
},
'params': {
# m3u8 download
@@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
'info_dict': {
'id': '4826.4',
- 'ext': 'mp4',
- 'title': 'Mnet Asian Music Awards 2015 4826.4',
+ 'ext': 'flv',
+ 'title': 'Mnet Asian Music Awards 2015',
'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
'episode': 'Mnet Asian Music Awards 2015 - Part 3',
'episode_number': 4,
'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1450213200,
'upload_date': '20151215',
- 'duration': 5602,
+ 'duration': 5359,
},
'params': {
# m3u8 download
@@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE):
countries=self._GEO_COUNTRIES)
raise
+ # title is postfixed with the video id for some reason; remove it
+ if info.get('title'):
+ info['title'] = remove_end(info['title'], video_id).strip()
+
series_id, episode_number = video_id.split('.')
episode_info = self._download_json(
# We only need a single episode info, so restricting page size to one episode
diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py
index c86f52319..e4a3046af 100644
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@@ -2,6 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
class EggheadCourseIE(InfoExtractor):
@@ -33,3 +38,47 @@ class EggheadCourseIE(InfoExtractor):
return self.playlist_result(
entries, playlist_id, course.get('title'),
course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+ IE_DESC = 'egghead.io lesson'
+ IE_NAME = 'egghead:lesson'
+ _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+ 'info_dict': {
+ 'id': 'fv5yotjxcg',
+ 'ext': 'mp4',
+ 'title': 'Create linear data flow with container style types (Box)',
+ 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+ 'thumbnail': r're:^https?:.*\.jpg$',
+ 'timestamp': 1481296768,
+ 'upload_date': '20161209',
+ 'duration': 304,
+ 'view_count': 0,
+ 'tags': ['javascript', 'free'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ lesson_id = self._match_id(url)
+
+ lesson = self._download_json(
+ 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'Wistia',
+ 'url': 'wistia:%s' % lesson['wistia_id'],
+ 'id': lesson['wistia_id'],
+ 'title': lesson.get('title'),
+ 'description': lesson.get('summary'),
+ 'thumbnail': lesson.get('thumb_nail'),
+ 'timestamp': unified_timestamp(lesson.get('published_at')),
+ 'duration': int_or_none(lesson.get('duration')),
+ 'view_count': int_or_none(lesson.get('plays_count')),
+ 'tags': try_get(lesson, lambda x: x['tag_list'], list),
+ }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e8a066b83..17048fd6e 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -186,8 +186,8 @@ from .chirbit import (
)
from .cinchcast import CinchcastIE
from .cjsw import CJSWIE
-from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
+from .clippit import ClippitIE
from .cliprs import ClipRsIE
from .clipsyndicate import ClipsyndicateIE
from .closertotruth import CloserToTruthIE
@@ -298,7 +298,10 @@ from .dw import (
from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+ EggheadCourseIE,
+ EggheadLessonIE,
+)
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
@@ -348,7 +351,12 @@ from .flipagram import FlipagramIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .formula1 import Formula1IE
-from .fourtube import FourTubeIE
+from .fourtube import (
+ FourTubeIE,
+ PornTubeIE,
+ PornerBrosIE,
+ FuxIE,
+)
from .fox import FOXIE
from .fox9 import FOX9IE
from .foxgay import FoxgayIE
@@ -501,6 +509,7 @@ from .la7 import LA7IE
from .laola1tv import (
Laola1TvEmbedIE,
Laola1TvIE,
+ ITTFIE,
)
from .lci import LCIIE
from .lcp import (
@@ -528,7 +537,10 @@ from .limelight import (
LimelightChannelListIE,
)
from .litv import LiTVIE
-from .liveleak import LiveLeakIE
+from .liveleak import (
+ LiveLeakIE,
+ LiveLeakEmbedIE,
+)
from .livestream import (
LivestreamIE,
LivestreamOriginalIE,
@@ -555,6 +567,7 @@ from .matchtv import MatchTVIE
from .mdr import MDRIE
from .mediaset import MediasetIE
from .medici import MediciIE
+from .megaphone import MegaphoneIE
from .meipai import MeipaiIE
from .melonvod import MelonVODIE
from .meta import METAIE
@@ -581,7 +594,6 @@ from .mixcloud import (
)
from .mlb import MLBIE
from .mnet import MnetIE
-from .mpora import MporaIE
from .moevideo import MoeVideoIE
from .mofosex import MofosexIE
from .mojvideo import MojvideoIE
@@ -670,6 +682,7 @@ from .nick import (
NickIE,
NickDeIE,
NickNightIE,
+ NickRuIE,
)
from .niconico import NiconicoIE, NiconicoPlaylistIE
from .ninecninemedia import (
@@ -837,6 +850,10 @@ from .rai import (
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import RedBullTVIE
+from .reddit import (
+ RedditIE,
+ RedditRIE,
+)
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
from .rentv import (
@@ -930,8 +947,9 @@ from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
SoundcloudUserIE,
+ SoundcloudTrackStationIE,
SoundcloudPlaylistIE,
- SoundcloudSearchIE
+ SoundcloudSearchIE,
)
from .soundgasm import (
SoundgasmIE,
@@ -989,7 +1007,6 @@ from .teachertube import (
)
from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
-from .teamfourstar import TeamFourStarIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tele13 import Tele13IE
@@ -1218,6 +1235,7 @@ from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
from .vodplatform import VODPlatformIE
from .voicerepublic import VoiceRepublicIE
+from .voot import VootIE
from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
@@ -1239,6 +1257,7 @@ from .washingtonpost import (
WashingtonPostArticleIE,
)
from .wat import WatIE
+from .watchbox import WatchBoxIE
from .watchindianporn import WatchIndianPornIE
from .wdr import (
WDRIE,
@@ -1293,6 +1312,7 @@ from .yandexmusic import (
YandexMusicAlbumIE,
YandexMusicPlaylistIE,
)
+from .yandexdisk import YandexDiskIE
from .yesjapan import YesJapanIE
from .yinyuetai import YinYueTaiIE
from .ynet import YnetIE
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index e3fd08bcf..ad273a0e7 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,39 +3,22 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
parse_duration,
parse_iso8601,
- sanitized_Request,
str_to_int,
)
-class FourTubeIE(InfoExtractor):
- IE_NAME = '4tube'
- _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
+class FourTubeBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
- _TEST = {
- 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
- 'md5': '6516c8ac63b03de06bc8eac14362db4f',
- 'info_dict': {
- 'id': '209733',
- 'ext': 'mp4',
- 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
- 'uploader': 'WCP Club',
- 'uploader_id': 'wcp-club',
- 'upload_date': '20131031',
- 'timestamp': 1383263892,
- 'duration': 583,
- 'view_count': int,
- 'like_count': int,
- 'categories': list,
- 'age_limit': 18,
- }
- }
+ if kind == 'm' or not display_id:
+ url = self._URL_TEMPLATE % video_id
- def _real_extract(self, url):
- video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta('name', webpage)
@@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor):
'uploadDate', webpage))
thumbnail = self._html_search_meta('thumbnailUrl', webpage)
uploader_id = self._html_search_regex(
- r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+ r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">',
webpage, 'uploader id', fatal=False)
uploader = self._html_search_regex(
- r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+ r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">',
webpage, 'uploader', fatal=False)
categories_html = self._search_regex(
@@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
- webpage, 'view count', fatal=False))
+ webpage, 'view count', default=None))
like_count = str_to_int(self._search_regex(
r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
- webpage, 'like count', fatal=False))
+ webpage, 'like count', default=None))
duration = parse_duration(self._html_search_meta('duration', webpage))
media_id = self._search_regex(
@@ -87,12 +70,12 @@ class FourTubeIE(InfoExtractor):
token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format(
media_id, '+'.join(sources))
- headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- b'Origin': b'https://www.4tube.com',
- }
- token_req = sanitized_Request(token_url, b'{}', headers)
- tokens = self._download_json(token_req, video_id)
+
+ parsed_url = compat_urlparse.urlparse(url)
+ tokens = self._download_json(token_url, video_id, data=b'', headers={
+ 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname),
+ 'Referer': url,
+ })
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
@@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor):
'duration': duration,
'age_limit': 18,
}
+
+
+class FourTubeIE(FourTubeBaseIE):
+ IE_NAME = '4tube'
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+ _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video'
+ _TESTS = [{
+ 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+ 'md5': '6516c8ac63b03de06bc8eac14362db4f',
+ 'info_dict': {
+ 'id': '209733',
+ 'ext': 'mp4',
+ 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
+ 'uploader': 'WCP Club',
+ 'uploader_id': 'wcp-club',
+ 'upload_date': '20131031',
+ 'timestamp': 1383263892,
+ 'duration': 583,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ }, {
+ 'url': 'http://www.4tube.com/embed/209733',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+ 'only_matching': True,
+ }]
+
+
+class FuxIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+ _URL_TEMPLATE = 'https://www.fux.com/video/%s/video'
+ _TESTS = [{
+ 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+ 'info_dict': {
+ 'id': '195359',
+ 'ext': 'mp4',
+ 'title': 'Awesome fucking in the kitchen ends with cum swallow',
+ 'uploader': 'alenci2342',
+ 'uploader_id': 'alenci2342',
+ 'upload_date': '20131230',
+ 'timestamp': 1388361660,
+ 'duration': 289,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.fux.com/embed/195359',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+ 'only_matching': True,
+ }]
+
+
+class PornTubeIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+ _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s'
+ _TESTS = [{
+ 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759',
+ 'info_dict': {
+ 'id': '7089759',
+ 'ext': 'mp4',
+ 'title': 'Teen couple doing anal',
+ 'uploader': 'Alexy',
+ 'uploader_id': 'Alexy',
+ 'upload_date': '20150606',
+ 'timestamp': 1433595647,
+ 'duration': 5052,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.porntube.com/embed/7089759',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759',
+ 'only_matching': True,
+ }]
+
+
+class PornerBrosIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+ _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s'
+ _TESTS = [{
+ 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+ 'md5': '6516c8ac63b03de06bc8eac14362db4f',
+ 'info_dict': {
+ 'id': '181369',
+ 'ext': 'mp4',
+ 'title': 'Skinny brunette takes big cock down her anal hole',
+ 'uploader': 'PornerBros HD',
+ 'uploader_id': 'pornerbros-hd',
+ 'upload_date': '20130130',
+ 'timestamp': 1359527401,
+ 'duration': 1224,
+ 'view_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.pornerbros.com/embed/181369',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+ 'only_matching': True,
+ }]
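A short sketch of how FourTubeBaseIE._real_extract() canonicalizes mobile and embed URLs before scraping, using one of the test URLs above:

    import re

    url = 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black'
    mobj = re.match(FourTubeIE._VALID_URL, url)
    kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
    if kind == 'm' or not display_id:  # mobile hostname or bare embed URL
        url = FourTubeIE._URL_TEMPLATE % video_id
    # url == 'https://www.4tube.com/videos/209733/video'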
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 49409369c..f85e7de14 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,10 +1,14 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ unified_timestamp,
+)
class FunnyOrDieIE(InfoExtractor):
@@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Heart-Shaped Box: Literal Video Version',
'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
'thumbnail': r're:^http:.*\.jpg$',
+ 'uploader': 'DASjr',
+ 'timestamp': 1317904928,
+ 'upload_date': '20111006',
+ 'duration': 318.3,
},
}, {
'url': 'http://www.funnyordie.com/embed/e402820827',
@@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Please Use This Song (Jon Lajoie)',
'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': r're:^http:.*\.jpg$',
+ 'timestamp': 1398988800,
+ 'upload_date': '20140502',
},
'params': {
'skip_download': True,
@@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
'url': 'http://www.funnyordie.com%s' % src,
}]
- post_json = self._search_regex(
- r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
- post = json.loads(post_json)
+ timestamp = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp', default=None))
+
+ uploader = self._html_search_regex(
+ r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
+ webpage, 'uploader', default=None)
+
+ title, description, thumbnail, duration = [None] * 4
+
+ medium = self._parse_json(
+ self._search_regex(
+ r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
+ default='{}'),
+ video_id, fatal=False)
+ if medium:
+ title = medium.get('title')
+ duration = float_or_none(medium.get('duration'))
+ if not timestamp:
+ timestamp = unified_timestamp(medium.get('publishDate'))
+
+ post = self._parse_json(
+ self._search_regex(
+ r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
+ default='{}'),
+ video_id, fatal=False)
+ if post:
+ if not title:
+ title = post.get('name')
+ description = post.get('description')
+ thumbnail = post.get('picture')
+
+ if not title:
+ title = self._og_search_title(webpage)
+ if not description:
+ description = self._og_search_description(webpage)
+ if not duration:
+ duration = int_or_none(self._html_search_meta(
+ ('video:duration', 'duration'), webpage, 'duration', default=False))
return {
'id': video_id,
- 'title': post['name'],
- 'description': post.get('description'),
- 'thumbnail': post.get('picture'),
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'duration': duration,
'formats': formats,
'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 0ab2ef2d6..49b00b87e 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -97,6 +97,8 @@ from .washingtonpost import WashingtonPostIE
from .wistia import WistiaIE
from .mediaset import MediasetIE
from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
class GenericIE(InfoExtractor):
@@ -574,6 +576,19 @@ class GenericIE(InfoExtractor):
},
'skip': 'movie expired',
},
+ # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+ {
+ 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+ 'info_dict': {
+ 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+ 'ext': 'mp4',
+ 'title': 'Steampunk Fest Comes to Honesdale',
+ 'duration': 43.276,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -1504,14 +1519,27 @@ class GenericIE(InfoExtractor):
# LiveLeak embed
{
'url': 'http://www.wykop.pl/link/3088787/',
- 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+ 'md5': '7619da8c820e835bef21a1efa2a0fc71',
'info_dict': {
'id': '874_1459135191',
'ext': 'mp4',
'title': 'Man shows poor quality of new apartment building',
'description': 'The wall is like a sand pile.',
'uploader': 'Lake8737',
- }
+ },
+ 'add_ie': [LiveLeakIE.ie_key()],
+ },
+ # Another LiveLeak embed pattern (#13336)
+ {
+ 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+ 'info_dict': {
+ 'id': '2eb_1496309988',
+ 'ext': 'mp4',
+ 'title': 'Thief robs place where everyone was armed',
+ 'description': 'md5:694d73ee79e535953cf2488562288eee',
+ 'uploader': 'brazilwtf',
+ },
+ 'add_ie': [LiveLeakIE.ie_key()],
},
# Duplicated embedded video URLs
{
@@ -1569,27 +1597,6 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
- # Nexx iFrame embed
- {
- 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
- 'info_dict': {
- 'id': '161464',
- 'ext': 'mp4',
- 'title': 'Nervenkitzel Achterbahn',
- 'alt_title': 'Karussellbauer in Deutschland',
- 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
- 'release_year': 2005,
- 'creator': 'SPIEGEL TV',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 2761,
- 'timestamp': 1394021479,
- 'upload_date': '20140305',
- },
- 'params': {
- 'format': 'bestvideo',
- 'skip_download': True,
- },
- },
# Facebook <iframe> embed
{
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1792,6 +1799,21 @@ class GenericIE(InfoExtractor):
'playlist_mincount': 5,
},
{
+ # Limelight embed (LimelightPlayerUtil.embed)
+ 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+ 'info_dict': {
+ 'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+ 'ext': 'mp4',
+ 'title': '07448641',
+ 'timestamp': 1499890639,
+ 'upload_date': '20170712',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['LimelightMedia'],
+ },
+ {
'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
'info_dict': {
'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
@@ -1847,6 +1869,16 @@ class GenericIE(InfoExtractor):
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
},
},
+ {
+ # vzaar embed
+ 'url': 'http://help.vzaar.com/article/165-embedding-video',
+ 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+ 'info_dict': {
+ 'id': '8707641',
+ 'ext': 'mp4',
+ 'title': 'Building A Business Online: Principal Chairs Q & A',
+ },
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1996,7 +2028,7 @@ class GenericIE(InfoExtractor):
if head_response is not False:
# Check for redirect
- new_url = head_response.geturl()
+ new_url = compat_str(head_response.geturl())
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -2097,7 +2129,7 @@ class GenericIE(InfoExtractor):
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats(
doc, video_id,
- mpd_base_url=full_response.geturl().rpartition('/')[0],
+ mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
mpd_url=url)
self._sort_formats(info_dict['formats'])
return info_dict
@@ -2313,6 +2345,7 @@ class GenericIE(InfoExtractor):
# Look for Ooyala videos
mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+ re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
@@ -2737,9 +2770,9 @@ class GenericIE(InfoExtractor):
self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
# Look for LiveLeak embeds
- liveleak_url = LiveLeakIE._extract_url(webpage)
- if liveleak_url:
- return self.url_result(liveleak_url, 'LiveLeak')
+ liveleak_urls = LiveLeakIE._extract_urls(webpage)
+ if liveleak_urls:
+ return self.playlist_from_matches(liveleak_urls, video_id, video_title)
# Look for 3Q SDN embeds
threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
@@ -2811,6 +2844,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
joj_urls, video_id, video_title, ie=JojIE.ie_key())
+ # Look for megaphone.fm embeds
+ mpfn_urls = MegaphoneIE._extract_urls(webpage)
+ if mpfn_urls:
+ return self.playlist_from_matches(
+ mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+ # Look for vzaar embeds
+ vzaar_urls = VzaarIE._extract_urls(webpage)
+ if vzaar_urls:
+ return self.playlist_from_matches(
+ vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
def merge_dicts(dict1, dict2):
merged = {}
for k, v in dict1.items():
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index f3156804d..26c48e4b8 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -59,12 +59,18 @@ class ITVIE(InfoExtractor):
def _add_sub_element(element, name):
return etree.SubElement(element, _add_ns(name))
+ production_id = (
+ params.get('data-video-autoplay-id') or
+ '%s#001' % (
+ params.get('data-video-episode-id') or
+ video_id.replace('a', '/')))
+
req_env = etree.Element(_add_ns('soapenv:Envelope'))
_add_sub_element(req_env, 'soapenv:Header')
body = _add_sub_element(req_env, 'soapenv:Body')
get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
request = _add_sub_element(get_playlist, 'tem:request')
- _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id']
+ _add_sub_element(request, 'itv:ProductionId').text = production_id
_add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
vodcrid = _add_sub_element(request, 'itv:Vodcrid')
_add_sub_element(vodcrid, 'com:Id')
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 1f91ba017..c7f813370 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -215,3 +215,21 @@ class Laola1TvIE(Laola1TvEmbedIE):
'formats': formats,
'is_live': is_live,
}
+
+
+class ITTFIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ return self.url_result(
+ update_url_query('https://www.laola1.tv/titanplayer.php', {
+ 'videoid': self._match_id(url),
+ 'type': 'V',
+ 'lang': 'en',
+ 'portal': 'int',
+ 'customer': 1024,
+ }), Laola1TvEmbedIE.ie_key())
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index 0a5a3956c..ad65b2759 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor):
'Channel': 'channel',
'ChannelList': 'channel_list',
}
+
+ def smuggle(url):
+ return smuggle_url(url, {'source_url': source_url})
+
entries = []
for kind, video_id in re.findall(
r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
webpage):
entries.append(cls.url_result(
- smuggle_url(
- 'limelight:%s:%s' % (lm[kind], video_id),
- {'source_url': source_url}),
+ smuggle('limelight:%s:%s' % (lm[kind], video_id)),
'Limelight%s' % kind, video_id))
for mobj in re.finditer(
# As per [1] class attribute should be exactly equal to
@@ -49,10 +51,15 @@ class LimelightBaseIE(InfoExtractor):
''', webpage):
kind, video_id = mobj.group('kind'), mobj.group('id')
entries.append(cls.url_result(
- smuggle_url(
- 'limelight:%s:%s' % (kind, video_id),
- {'source_url': source_url}),
+ smuggle('limelight:%s:%s' % (kind, video_id)),
'Limelight%s' % kind.capitalize(), video_id))
+ # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page
+ for video_id in re.findall(
+ r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})',
+ webpage):
+ entries.append(cls.url_result(
+ smuggle('limelight:media:%s' % video_id),
+ LimelightMediaIE.ie_key(), video_id))
return entries
def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index b2247a84d..246aac576 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -72,15 +72,20 @@ class LiveLeakIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://www.liveleak.com/view?i=677_1439397581',
+ 'info_dict': {
+ 'id': '677_1439397581',
+ 'title': 'Fuel Depot in China Explosion caught on video',
+ },
+ 'playlist_count': 3,
}]
@staticmethod
- def _extract_url(webpage):
- mobj = re.search(
- r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)',
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"',
webpage)
- if mobj:
- return 'http://www.liveleak.com/view?i=%s' % mobj.group('id')
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -111,23 +116,54 @@ class LiveLeakIE(InfoExtractor):
'age_limit': age_limit,
}
- info_dict = entries[0]
+ for idx, info_dict in enumerate(entries):
+ for a_format in info_dict['formats']:
+ if not a_format.get('height'):
+ a_format['height'] = int_or_none(self._search_regex(
+ r'([0-9]+)p\.mp4', a_format['url'], 'height label',
+ default=None))
+
+ self._sort_formats(info_dict['formats'])
+
+ # Don't append entry ID for one-video pages to keep backward compatibility
+ if len(entries) > 1:
+ info_dict['id'] = '%s_%s' % (video_id, idx + 1)
+ else:
+ info_dict['id'] = video_id
- for a_format in info_dict['formats']:
- if not a_format.get('height'):
- a_format['height'] = int_or_none(self._search_regex(
- r'([0-9]+)p\.mp4', a_format['url'], 'height label',
- default=None))
+ info_dict.update({
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ 'thumbnail': video_thumbnail,
+ })
- self._sort_formats(info_dict['formats'])
+ return self.playlist_result(entries, video_id, video_title)
+
+
+class LiveLeakEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)'
+
+ # See generic.py for actual test cases
+ _TESTS = [{
+ 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id = mobj.group('kind', 'id')
- info_dict.update({
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- 'thumbnail': video_thumbnail,
- })
+ if kind == 'f':
+ webpage = self._download_webpage(url, video_id)
+ liveleak_url = self._search_regex(
+ r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+ webpage, 'LiveLeak URL', group='url')
+ elif kind == 'i':
+ liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id
- return info_dict
+ return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py
new file mode 100644
index 000000000..60e3caf0d
--- /dev/null
+++ b/youtube_dl/extractor/megaphone.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class MegaphoneIE(InfoExtractor):
+ IE_NAME = 'megaphone.fm'
+ IE_DESC = 'megaphone.fm embedded players'
+ _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
+ _TEST = {
+ 'url': 'https://player.megaphone.fm/GLT9749789991?"',
+ 'md5': '4816a0de523eb3e972dc0dda2c191f96',
+ 'info_dict': {
+ 'id': 'GLT9749789991',
+ 'ext': 'mp3',
+ 'title': '#97 What Kind Of Idiot Gets Phished?',
+ 'thumbnail': r're:^https://.*\.png.*$',
+ 'duration': 1776.26375,
+ 'author': 'Reply All',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_property('audio:title', webpage)
+ author = self._og_search_property('audio:artist', webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON')
+ episode_data = self._parse_json(episode_json, video_id, js_to_json)
+ video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:')
+
+ formats = [{
+ 'url': video_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'thumbnail': thumbnail,
+ 'title': title,
+ 'author': author,
+ 'duration': episode_data['duration'],
+ 'formats': formats,
+ }
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return [m[0] for m in re.findall(
+ r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 0efbe660a..798968ae3 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -9,6 +9,7 @@ from .common import InfoExtractor
from ..compat import (
compat_chr,
compat_ord,
+ compat_str,
compat_urllib_parse_unquote,
compat_urlparse,
)
@@ -53,16 +54,27 @@ class MixcloudIE(InfoExtractor):
'only_matching': True,
}]
- # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
- @staticmethod
- def _decrypt_play_info(play_info):
- KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
+ _keys = [
+ 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
+ 'pleasedontdownloadourmusictheartistswontgetpaid',
+ 'window.addEventListener = window.addEventListener || function() {};',
+ '(function() { return new Date().toLocaleDateString(); })()'
+ ]
+ _current_key = None
+ # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
+ def _decrypt_play_info(self, play_info, video_id):
play_info = base64.b64decode(play_info.encode('ascii'))
-
- return ''.join([
- compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
- for idx, ch in enumerate(play_info)])
+ for num, key in enumerate(self._keys, start=1):
+ try:
+ return self._parse_json(
+ ''.join([
+ compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
+ for idx, ch in enumerate(play_info)]),
+ video_id)
+ except ExtractorError:
+ if num == len(self._keys):
+ raise
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -72,14 +84,30 @@ class MixcloudIE(InfoExtractor):
webpage = self._download_webpage(url, track_id)
+ if not self._current_key:
+ js_url = self._search_regex(
+ r'<script[^>]+\bsrc=["\'](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
+ webpage, 'js url', default=None)
+ if js_url:
+ js = self._download_webpage(js_url, track_id, fatal=False)
+ if js:
+ KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
+ for key_name in ('value', 'key_value'):
+ key = self._search_regex(
+ KEY_RE_TEMPLATE % key_name, js, 'key',
+ default=None, group='key')
+ if key and isinstance(key, compat_str):
+ self._keys.insert(0, key)
+ self._current_key = key
+
message = self._html_search_regex(
r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
webpage, 'error message', default=None)
encrypted_play_info = self._search_regex(
r'm-play-info="([^"]+)"', webpage, 'play info')
- play_info = self._parse_json(
- self._decrypt_play_info(encrypted_play_info), track_id)
+
+ play_info = self._decrypt_play_info(encrypted_play_info, track_id)
if message and 'stream_url' not in play_info:
raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
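The new _decrypt_play_info is a rolling XOR over a list of candidate keys: each key is tried in turn and accepted only when the decrypted bytes parse as JSON. A self-contained Python 3 sketch of the same scheme (the real method goes through compat_chr/compat_ord and _parse_json; this simplification is illustrative only):

    import base64
    import json

    def xor_decrypt(encrypted_b64, key):
        # base64-decode, then XOR every byte with the repeating key
        data = base64.b64decode(encrypted_b64)
        return ''.join(chr(b ^ ord(key[i % len(key)])) for i, b in enumerate(data))

    def decrypt_with_candidates(encrypted_b64, keys):
        for key in keys:
            try:
                return json.loads(xor_decrypt(encrypted_b64, key))
            except ValueError:  # wrong key -> garbage, not JSON; try the next
                continue
        raise ValueError('no candidate key produced valid JSON')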
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 59cd4b838..675ff6873 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -15,7 +15,7 @@ class MLBIE(InfoExtractor):
(?:[\da-z_-]+\.)*mlb\.com/
(?:
(?:
- (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+ (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
(?:
shared/video/embed/(?:embed|m-internal-embed)\.html|
(?:[^/]+/)+(?:play|index)\.jsp|
@@ -84,7 +84,7 @@ class MLBIE(InfoExtractor):
},
{
'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
- 'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+ 'md5': 'aafaf5b0186fee8f32f20508092f8111',
'info_dict': {
'id': '75609783',
'ext': 'mp4',
@@ -95,6 +95,10 @@ class MLBIE(InfoExtractor):
}
},
{
+ 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+ 'only_matching': True,
+ },
+ {
'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
'only_matching': True,
},
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
deleted file mode 100644
index 5a1bee5c8..000000000
--- a/youtube_dl/extractor/mpora.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class MporaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)'
- IE_NAME = 'MPORA'
-
- _TEST = {
- 'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de',
- 'md5': 'a7a228473eedd3be741397cf452932eb',
- 'info_dict': {
- 'id': 'AAdo8okx4wiz',
- 'ext': 'mp4',
- 'title': 'Katy Curd - Winter in the Forest',
- 'duration': 416,
- 'uploader': 'Peter Newman Media',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- data_json = self._search_regex(
- [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;",
- r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"],
- webpage, 'json')
- data = self._parse_json(data_json, video_id)
-
- uploader = data['info_overlay'].get('username')
- duration = data['video']['duration'] // 1000
- thumbnail = data['video']['encodings']['sd']['poster']
- title = data['info_overlay']['title']
-
- formats = []
- for encoding_id, edata in data['video']['encodings'].items():
- for src in edata['sources']:
- width_str = self._search_regex(
- r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'],
- False, default=None)
- vcodec = src['type'].partition('/')[2]
-
- formats.append({
- 'format_id': encoding_id + '-' + vcodec,
- 'url': src['src'],
- 'vcodec': vcodec,
- 'width': int_or_none(width_str),
- })
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'uploader': uploader,
- 'duration': duration,
- 'thumbnail': thumbnail,
- }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 8acea1461..25af5ddfd 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -50,8 +50,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
thumb_node = itemdoc.find(search_path)
if thumb_node is None:
return None
- else:
- return thumb_node.attrib['url']
+ return thumb_node.get('url') or thumb_node.text or None
def _extract_mobile_video_formats(self, mtvn_id):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
@@ -83,7 +82,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
hls_url = rendition.find('./src').text
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls'))
+ m3u8_id='hls', fatal=False))
else:
# fms
try:
@@ -106,7 +105,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
}])
except (KeyError, TypeError):
raise ExtractorError('Invalid rendition field.')
- self._sort_formats(formats)
+ if formats:
+ self._sort_formats(formats)
return formats
def _extract_subtitles(self, mdoc, mtvn_id):
@@ -133,8 +133,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
mediagen_url += 'acceptMethods='
mediagen_url += 'hls' if use_hls else 'fms'
- mediagen_doc = self._download_xml(mediagen_url, video_id,
- 'Downloading video urls')
+ mediagen_doc = self._download_xml(
+ mediagen_url, video_id, 'Downloading video urls', fatal=False)
+
+ if mediagen_doc is False:
+ return None
item = mediagen_doc.find('./video/item')
if item is not None and item.get('type') == 'text':
@@ -174,6 +177,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
+ # Some parts of a complete video may be missing (e.g. missing Act 3 in
+ # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
+ if not formats:
+ return None
+
+ self._sort_formats(formats)
+
return {
'title': title,
'formats': formats,
@@ -205,9 +215,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
title = xpath_text(idoc, './channel/title')
description = xpath_text(idoc, './channel/description')
+ entries = []
+ for item in idoc.findall('.//item'):
+ info = self._get_video_info(item, use_hls)
+ if info:
+ entries.append(info)
+
return self.playlist_result(
- [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')],
- playlist_title=title, playlist_description=description)
+ entries, playlist_title=title, playlist_description=description)
def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
triforce_feed = self._parse_json(self._search_regex(
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index 08a75929e..510b1c41f 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -12,6 +12,7 @@ class NickIE(MTVServicesInfoExtractor):
IE_NAME = 'nick.com'
_VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
_FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+ _GEO_COUNTRIES = ['US']
_TESTS = [{
'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
'playlist': [
@@ -74,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor):
class NickDeIE(MTVServicesInfoExtractor):
IE_NAME = 'nick.de'
- _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
'only_matching': True,
@@ -87,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor):
}, {
'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht',
'only_matching': True,
+ }, {
+ 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom',
+ 'only_matching': True,
}]
def _extract_mrss_url(self, webpage, host):
@@ -124,3 +128,21 @@ class NickNightIE(NickDeIE):
return self._search_regex(
r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage,
'mrss url', group='url')
+
+
+class NickRuIE(MTVServicesInfoExtractor):
+ IE_NAME = 'nickelodeonru'
+ _VALID_URL = r'https?://(?:www\.)?nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ mgid = self._extract_mgid(webpage)
+ return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 695e32e59..026329d3e 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -11,10 +11,15 @@ from ..compat import (
)
from ..utils import (
determine_ext,
+ dict_get,
ExtractorError,
int_or_none,
+ float_or_none,
parse_duration,
parse_iso8601,
+ remove_start,
+ try_get,
+ unified_timestamp,
urlencode_postdata,
xpath_text,
)
@@ -31,12 +36,15 @@ class NiconicoIE(InfoExtractor):
'id': 'sm22312215',
'ext': 'mp4',
'title': 'Big Buck Bunny',
+ 'thumbnail': r're:https?://.*',
'uploader': 'takuya0301',
'uploader_id': '2698420',
'upload_date': '20131123',
'timestamp': 1385182762,
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
'duration': 33,
+ 'view_count': int,
+ 'comment_count': int,
},
'skip': 'Requires an account',
}, {
@@ -48,6 +56,7 @@ class NiconicoIE(InfoExtractor):
'ext': 'swf',
'title': '【鏡音リン】Dance on media【オリジナル】take2!',
'description': 'md5:689f066d74610b3b22e0f1739add0f58',
+ 'thumbnail': r're:https?://.*',
'uploader': 'りょうた',
'uploader_id': '18822557',
'upload_date': '20110429',
@@ -64,9 +73,11 @@ class NiconicoIE(InfoExtractor):
'ext': 'unknown_video',
'description': 'deleted',
'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+ 'thumbnail': r're:https?://.*',
'upload_date': '20071224',
'timestamp': int, # timestamp field has different value if logged in
'duration': 304,
+ 'view_count': int,
},
'skip': 'Requires an account',
}, {
@@ -76,6 +87,7 @@ class NiconicoIE(InfoExtractor):
'ext': 'mp4',
'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+ 'thumbnail': r're:https?://.*',
'timestamp': 1388851200,
'upload_date': '20140104',
'uploader': 'アニメロチャンネル',
@@ -83,6 +95,44 @@ class NiconicoIE(InfoExtractor):
},
'skip': 'The viewing period of the video you were searching for has expired.',
}, {
+ # video not available via `getflv`; "old" HTML5 video
+ 'url': 'http://www.nicovideo.jp/watch/sm1151009',
+ 'md5': '8fa81c364eb619d4085354eab075598a',
+ 'info_dict': {
+ 'id': 'sm1151009',
+ 'ext': 'mp4',
+ 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
+ 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
+ 'thumbnail': r're:https?://.*',
+ 'duration': 184,
+ 'timestamp': 1190868283,
+ 'upload_date': '20070927',
+ 'uploader': 'denden2',
+ 'uploader_id': '1392194',
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ 'skip': 'Requires an account',
+ }, {
+ # "New" HTML5 video
+ 'url': 'http://www.nicovideo.jp/watch/sm31464864',
+ 'md5': '351647b4917660986dc0fa8864085135',
+ 'info_dict': {
+ 'id': 'sm31464864',
+ 'ext': 'mp4',
+ 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
+ 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
+ 'timestamp': 1498514060,
+ 'upload_date': '20170626',
+ 'uploader': 'ゲス',
+ 'uploader_id': '40826363',
+ 'thumbnail': r're:https?://.*',
+ 'duration': 198,
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ 'skip': 'Requires an account',
+ }, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
}]
@@ -119,6 +169,84 @@ class NiconicoIE(InfoExtractor):
self._downloader.report_warning('unable to log in: bad username or password')
return login_ok
+ def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+ def yesno(boolean):
+ return 'yes' if boolean else 'no'
+
+ session_api_data = api_data['video']['dmcInfo']['session_api']
+ session_api_endpoint = session_api_data['urls'][0]
+
+ format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+
+ session_response = self._download_json(
+ session_api_endpoint['url'], video_id,
+ query={'_format': 'json'},
+ headers={'Content-Type': 'application/json'},
+ note='Downloading JSON metadata for %s' % format_id,
+ data=json.dumps({
+ 'session': {
+ 'client_info': {
+ 'player_id': session_api_data['player_id'],
+ },
+ 'content_auth': {
+ 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
+ 'content_key_timeout': session_api_data['content_key_timeout'],
+ 'service_id': 'nicovideo',
+ 'service_user_id': session_api_data['service_user_id']
+ },
+ 'content_id': session_api_data['content_id'],
+ 'content_src_id_sets': [{
+ 'content_src_ids': [{
+ 'src_id_to_mux': {
+ 'audio_src_ids': [audio_quality['id']],
+ 'video_src_ids': [video_quality['id']],
+ }
+ }]
+ }],
+ 'content_type': 'movie',
+ 'content_uri': '',
+ 'keep_method': {
+ 'heartbeat': {
+ 'lifetime': session_api_data['heartbeat_lifetime']
+ }
+ },
+ 'priority': session_api_data['priority'],
+ 'protocol': {
+ 'name': 'http',
+ 'parameters': {
+ 'http_parameters': {
+ 'parameters': {
+ 'http_output_download_parameters': {
+ 'use_ssl': yesno(session_api_endpoint['is_ssl']),
+ 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
+ }
+ }
+ }
+ }
+ },
+ 'recipe_id': session_api_data['recipe_id'],
+ 'session_operation_auth': {
+ 'session_operation_auth_by_signature': {
+ 'signature': session_api_data['signature'],
+ 'token': session_api_data['token'],
+ }
+ },
+ 'timing_constraint': 'unlimited'
+ }
+ }))
+
+ resolution = video_quality.get('resolution', {})
+
+ return {
+ 'url': session_response['data']['session']['content_uri'],
+ 'format_id': format_id,
+ 'ext': 'mp4', # Session API is used in HTML5, which always serves mp4
+ 'abr': float_or_none(audio_quality.get('bitrate'), 1000),
+ 'vbr': float_or_none(video_quality.get('bitrate'), 1000),
+ 'height': resolution.get('height'),
+ 'width': resolution.get('width'),
+ }
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -130,30 +258,84 @@ class NiconicoIE(InfoExtractor):
if video_id.startswith('so'):
video_id = self._match_id(handle.geturl())
- video_info = self._download_xml(
- 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
- note='Downloading video info page')
-
- # Get flv info
- flv_info_webpage = self._download_webpage(
- 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
- video_id, 'Downloading flv info')
-
- flv_info = compat_urlparse.parse_qs(flv_info_webpage)
- if 'url' not in flv_info:
- if 'deleted' in flv_info:
- raise ExtractorError('The video has been deleted.',
- expected=True)
- elif 'closed' in flv_info:
- raise ExtractorError('Niconico videos now require logging in',
- expected=True)
- else:
- raise ExtractorError('Unable to find video URL')
-
- video_real_url = flv_info['url'][0]
+ api_data = self._parse_json(self._html_search_regex(
+ 'data-api-data="([^"]+)"', webpage,
+ 'API data', default='{}'), video_id)
+
+ def _format_id_from_url(video_url):
+ return 'economy' if video_url.endswith('low') else 'normal'
+
+ try:
+ video_real_url = api_data['video']['smileInfo']['url']
+ except KeyError: # Flash videos
+ # Get flv info
+ flv_info_webpage = self._download_webpage(
+ 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
+ video_id, 'Downloading flv info')
+
+ flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+ if 'url' not in flv_info:
+ if 'deleted' in flv_info:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
+ elif 'closed' in flv_info:
+ raise ExtractorError('Niconico videos now require logging in',
+ expected=True)
+ elif 'error' in flv_info:
+ raise ExtractorError('%s reports error: %s' % (
+ self.IE_NAME, flv_info['error'][0]), expected=True)
+ else:
+ raise ExtractorError('Unable to find video URL')
+
+ video_info_xml = self._download_xml(
+ 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+ video_id, note='Downloading video info page')
+
+ def get_video_info(items):
+ if not isinstance(items, list):
+ items = [items]
+ for item in items:
+ ret = xpath_text(video_info_xml, './/' + item)
+ if ret:
+ return ret
+
+ video_real_url = flv_info['url'][0]
+
+ extension = get_video_info('movie_type')
+ if not extension:
+ extension = determine_ext(video_real_url)
+
+ formats = [{
+ 'url': video_real_url,
+ 'ext': extension,
+ 'format_id': _format_id_from_url(video_real_url),
+ }]
+ else:
+ formats = []
+
+ dmc_info = api_data['video'].get('dmcInfo')
+ if dmc_info: # "New" HTML5 videos
+ quality_info = dmc_info['quality']
+ for audio_quality in quality_info['audios']:
+ for video_quality in quality_info['videos']:
+ if not audio_quality['available'] or not video_quality['available']:
+ continue
+ formats.append(self._extract_format_for_quality(
+ api_data, video_id, audio_quality, video_quality))
+
+ self._sort_formats(formats)
+ else: # "Old" HTML5 videos
+ formats = [{
+ 'url': video_real_url,
+ 'ext': 'mp4',
+ 'format_id': _format_id_from_url(video_real_url),
+ }]
+
+ def get_video_info(items):
+ return dict_get(api_data['video'], items)
# Start extracting information
- title = xpath_text(video_info, './/title')
+ title = get_video_info('title')
if not title:
title = self._og_search_title(webpage, default=None)
if not title:
@@ -167,18 +349,15 @@ class NiconicoIE(InfoExtractor):
watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
video_detail = watch_api_data.get('videoDetail', {})
- extension = xpath_text(video_info, './/movie_type')
- if not extension:
- extension = determine_ext(video_real_url)
-
thumbnail = (
- xpath_text(video_info, './/thumbnail_url') or
+ get_video_info(['thumbnail_url', 'thumbnailURL']) or
self._html_search_meta('image', webpage, 'thumbnail', default=None) or
video_detail.get('thumbnail'))
- description = xpath_text(video_info, './/description')
+ description = get_video_info('description')
- timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
+ timestamp = (parse_iso8601(get_video_info('first_retrieve')) or
+ unified_timestamp(get_video_info('postedDateTime')))
if not timestamp:
match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
if match:
@@ -188,7 +367,7 @@ class NiconicoIE(InfoExtractor):
video_detail['postedAt'].replace('/', '-'),
delimiter=' ', timezone=datetime.timedelta(hours=9))
- view_count = int_or_none(xpath_text(video_info, './/view_counter'))
+ view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
if not view_count:
match = self._html_search_regex(
r'>Views: <strong[^>]*>([^<]+)</strong>',
@@ -197,38 +376,33 @@ class NiconicoIE(InfoExtractor):
view_count = int_or_none(match.replace(',', ''))
view_count = view_count or video_detail.get('viewCount')
- comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
+ comment_count = (int_or_none(get_video_info('comment_num')) or
+ video_detail.get('commentCount') or
+ try_get(api_data, lambda x: x['thread']['commentCount']))
if not comment_count:
match = self._html_search_regex(
r'>Comments: <strong[^>]*>([^<]+)</strong>',
webpage, 'comment count', default=None)
if match:
comment_count = int_or_none(match.replace(',', ''))
- comment_count = comment_count or video_detail.get('commentCount')
duration = (parse_duration(
- xpath_text(video_info, './/length') or
+ get_video_info('length') or
self._html_search_meta(
'video:duration', webpage, 'video duration', default=None)) or
- video_detail.get('length'))
+ video_detail.get('length') or
+ get_video_info('duration'))
- webpage_url = xpath_text(video_info, './/watch_url') or url
+ webpage_url = get_video_info('watch_url') or url
- if video_info.find('.//ch_id') is not None:
- uploader_id = video_info.find('.//ch_id').text
- uploader = video_info.find('.//ch_name').text
- elif video_info.find('.//user_id') is not None:
- uploader_id = video_info.find('.//user_id').text
- uploader = video_info.find('.//user_nickname').text
- else:
- uploader_id = uploader = None
+ owner = api_data.get('owner', {})
+ uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
+ uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
return {
'id': video_id,
- 'url': video_real_url,
'title': title,
- 'ext': extension,
- 'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
+ 'formats': formats,
'thumbnail': thumbnail,
'description': description,
'uploader': uploader,
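The replacement get_video_info helpers above lean on dict_get to try several key spellings (snake_case from the thumb API, camelCase from the embedded API data) and return the first usable value. A rough re-implementation of that lookup, with its skip-empty behavior inferred from usage here rather than quoted from youtube_dl.utils:

    # Approximation of the dict_get helper get_video_info relies on.
    def dict_get(d, key_or_keys, default=None):
        if isinstance(key_or_keys, (list, tuple)):
            for key in key_or_keys:
                value = d.get(key)
                if value:  # skip missing, None and empty values
                    return value
            return default
        return d.get(key_or_keys, default)

    video = {'thumbnailURL': 'https://example.invalid/t.jpg'}
    print(dict_get(video, ['thumbnail_url', 'thumbnailURL']))  # camelCase fallback wins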
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 516b1e941..fa4ef20c5 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -28,7 +28,7 @@ class NPOBaseIE(InfoExtractor):
class NPOIE(NPOBaseIE):
IE_NAME = 'npo'
- IE_DESC = 'npo.nl and ntr.nl'
+ IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
_VALID_URL = r'''(?x)
(?:
npo:|
@@ -38,7 +38,7 @@ class NPOIE(NPOBaseIE):
npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|
ntr\.nl/(?:[^/]+/){2,}|
omroepwnl\.nl/video/fragment/[^/]+__|
- zapp\.nl/[^/]+/[^/]+/
+ (?:zapp|npo3)\.nl/(?:[^/]+/){2}
)
)
(?P<id>[^/?#]+)
@@ -147,6 +147,9 @@ class NPOIE(NPOBaseIE):
'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
'only_matching': True,
}, {
+ 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
+ 'only_matching': True,
+ }, {
# live stream
'url': 'npo:LI_NL1_4188102',
'only_matching': True,
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 3b4f51f61..18ead9426 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE):
(?:/\d{2}-\d{2}-\d{4})?
(?:\#del=(?P<part_id>\d+))?
''' % _EPISODE_RE
- _API_HOST = 'psapi-we.nrk.no'
+ _API_HOST = 'psapi-ne.nrk.no'
_TESTS = [{
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 16cc667d0..8889e4a1a 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -189,7 +189,7 @@ class PBSIE(InfoExtractor):
# Direct video URL
(?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player (or direct video)
- (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+ (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
@@ -346,6 +346,21 @@ class PBSIE(InfoExtractor):
},
},
{
+ # https://github.com/rg3/youtube-dl/issues/13801
+ 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+ 'info_dict': {
+ 'id': '3003333873',
+ 'ext': 'mp4',
+ 'title': 'PBS NewsHour - full episode July 31, 2017',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'duration': 3265,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
},
@@ -433,6 +448,9 @@ class PBSIE(InfoExtractor):
if url:
break
+ if not url:
+ url = self._og_search_url(webpage)
+
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index bfa12edc9..e5e08538c 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE):
stream = self._call_api(
'getAccessPublic', {'broadcast_id': token}, token)
+ video_urls = set()
formats = []
- for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+ for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
video_url = stream.get(format_id + '_url')
- if not video_url:
+ if not video_url or video_url in video_urls:
continue
- f = {
+ video_urls.add(video_url)
+ if format_id != 'rtmp':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, token, 'mp4',
+ entry_protocol='m3u8_native'
+ if state in ('ended', 'timed_out') else 'm3u8',
+ m3u8_id=format_id, fatal=False))
+ continue
+ formats.append({
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
- }
- if format_id != 'rtmp':
- f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8'
- formats.append(f)
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index e45d9fe55..f6a9131b1 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -18,6 +18,7 @@ from ..utils import (
parse_duration,
qualities,
srt_subtitles_timecode,
+ try_get,
update_url_query,
urlencode_postdata,
)
@@ -26,6 +27,39 @@ from ..utils import (
class PluralsightBaseIE(InfoExtractor):
_API_BASE = 'https://app.pluralsight.com'
+ def _download_course(self, course_id, url, display_id):
+ try:
+ return self._download_course_rpc(course_id, url, display_id)
+ except ExtractorError:
+ # Old API fallback
+ return self._download_json(
+ 'https://app.pluralsight.com/player/user/api/v1/player/payload',
+ display_id, data=urlencode_postdata({'courseId': course_id}),
+ headers={'Referer': url})
+
+ def _download_course_rpc(self, course_id, url, display_id):
+ response = self._download_json(
+ '%s/player/functions/rpc' % self._API_BASE, display_id,
+ 'Downloading course JSON',
+ data=json.dumps({
+ 'fn': 'bootstrapPlayer',
+ 'payload': {
+ 'courseId': course_id,
+ },
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json;charset=utf-8',
+ 'Referer': url,
+ })
+
+ course = try_get(response, lambda x: x['payload']['course'], dict)
+ if course:
+ return course
+
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, response['error']['message']),
+ expected=True)
+
class PluralsightIE(PluralsightBaseIE):
IE_NAME = 'pluralsight'
@@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE):
display_id = '%s-%s' % (name, clip_id)
- course = self._download_json(
- 'https://app.pluralsight.com/player/user/api/v1/player/payload',
- display_id, data=urlencode_postdata({'courseId': course_name}),
- headers={'Referer': url})
+ course = self._download_course(course_name, url, display_id)
collection = course['modules']
@@ -224,6 +255,7 @@ class PluralsightIE(PluralsightBaseIE):
req_format_split = req_format.split('-', 1)
if len(req_format_split) > 1:
req_ext, req_quality = req_format_split
+ req_quality = '-'.join(req_quality.split('-')[:2])
for allowed_quality in ALLOWED_QUALITIES:
if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
return (AllowedQuality(req_ext, (req_quality, )), )
@@ -330,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE):
# TODO: PSM cookie
- course = self._download_json(
- '%s/player/functions/rpc' % self._API_BASE, course_id,
- 'Downloading course JSON',
- data=json.dumps({
- 'fn': 'bootstrapPlayer',
- 'payload': {
- 'courseId': course_id,
- }
- }).encode('utf-8'),
- headers={
- 'Content-Type': 'application/json;charset=utf-8'
- })['payload']['course']
+ course = self._download_course(course_id, url, course_id)
title = course['title']
course_name = course['name']
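Both call sites now funnel through _download_course, which prefers the RPC endpoint and falls back to the legacy payload API only when the new one raises. The control flow reduces to this sketch (the fetchers are hypothetical stand-ins for the two _download_json calls):

    class ApiError(Exception):
        pass

    def fetch_course(course_id, fetch_rpc, fetch_legacy):
        # Try the bootstrapPlayer RPC first; any extraction failure
        # falls back to the old player/payload endpoint.
        try:
            response = fetch_rpc(course_id)
            course = response.get('payload', {}).get('course')
            if not isinstance(course, dict):
                raise ApiError('no course in RPC payload')
            return course
        except ApiError:
            return fetch_legacy(course_id)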
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
index f20946a2b..25fcebf9f 100644
--- a/youtube_dl/extractor/podomatic.py
+++ b/youtube_dl/extractor/podomatic.py
@@ -9,39 +9,46 @@ from ..utils import int_or_none
class PodomaticIE(InfoExtractor):
IE_NAME = 'podomatic'
- _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
+ _VALID_URL = r'''(?x)
+ (?P<proto>https?)://
+ (?:
+ (?P<channel>[^.]+)\.podomatic\.com/entry|
+ (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes
+ )/
+ (?P<id>[^/?#&]+)
+ '''
- _TESTS = [
- {
- 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
- 'md5': '84bb855fcf3429e6bf72460e1eed782d',
- 'info_dict': {
- 'id': '2009-01-02T16_03_35-08_00',
- 'ext': 'mp3',
- 'uploader': 'Science Teaching Tips',
- 'uploader_id': 'scienceteachingtips',
- 'title': '64. When the Moon Hits Your Eye',
- 'duration': 446,
- }
- },
- {
- 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
- 'md5': 'd2cf443931b6148e27638650e2638297',
- 'info_dict': {
- 'id': '2013-11-15T16_31_21-08_00',
- 'ext': 'mp3',
- 'uploader': 'Ostbahnhof / Techno Mix',
- 'uploader_id': 'ostbahnhof',
- 'title': 'Einunddreizig',
- 'duration': 3799,
- }
- },
- ]
+ _TESTS = [{
+ 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+ 'md5': '84bb855fcf3429e6bf72460e1eed782d',
+ 'info_dict': {
+ 'id': '2009-01-02T16_03_35-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Science Teaching Tips',
+ 'uploader_id': 'scienceteachingtips',
+ 'title': '64. When the Moon Hits Your Eye',
+ 'duration': 446,
+ }
+ }, {
+ 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+ 'md5': 'd2cf443931b6148e27638650e2638297',
+ 'info_dict': {
+ 'id': '2013-11-15T16_31_21-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Ostbahnhof / Techno Mix',
+ 'uploader_id': 'ostbahnhof',
+ 'title': 'Einunddreizig',
+ 'duration': 3799,
+ }
+ }, {
+ 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- channel = mobj.group('channel')
+ channel = mobj.group('channel') or mobj.group('channel_2')
json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +
'?permalink=true&rtmp=0') %
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 842317e6c..36761788d 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):
r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
sources = self._parse_json(js_to_json(self._search_regex(
- r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
+ r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]",
webpage, 'sources', default='{}')), video_id)
if not sources:
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index e032817f2..3428458af 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -186,7 +186,7 @@ class PornHubIE(InfoExtractor):
title, thumbnail, duration = [None] * 3
video_uploader = self._html_search_regex(
- r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
+ r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
view_count = self._extract_count(
@@ -227,13 +227,20 @@ class PornHubIE(InfoExtractor):
class PornHubPlaylistBaseIE(InfoExtractor):
def _extract_entries(self, webpage):
+ # Only process container div with main playlist content skipping
+ # drop-down menu that uses similar pattern for videos (see
+ # https://github.com/rg3/youtube-dl/issues/11594).
+ container = self._search_regex(
+ r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+ 'container', default=webpage)
+
return [
self.url_result(
'http://www.pornhub.com/%s' % video_url,
PornHubIE.ie_key(), video_title=title)
for video_url, title in orderedSet(re.findall(
r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
- webpage))
+ container))
]
def _real_extract(self, url):
@@ -241,14 +248,7 @@ class PornHubPlaylistBaseIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- # Only process container div with main playlist content skipping
- # drop-down menu that uses similar pattern for videos (see
- # https://github.com/rg3/youtube-dl/issues/11594).
- container = self._search_regex(
- r'(?s)(<div[^>]+class=["\']container.+)', webpage,
- 'container', default=webpage)
-
- entries = self._extract_entries(container)
+ entries = self._extract_entries(webpage)
playlist = self._parse_json(
self._search_regex(
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 17c27da46..084308aeb 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -2,38 +2,37 @@
from __future__ import unicode_literals
import random
-import time
import re
+import time
from .common import InfoExtractor
from ..utils import (
- sanitized_Request,
- strip_jsonp,
- unescapeHTML,
clean_html,
ExtractorError,
+ strip_jsonp,
+ unescapeHTML,
)
class QQMusicIE(InfoExtractor):
IE_NAME = 'qqmusic'
IE_DESC = 'QQ音乐'
- _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
- 'md5': '9ce1c1c8445f561506d2e3cfb0255705',
+ 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
+ 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
'info_dict': {
'id': '004295Et37taLD',
'ext': 'mp3',
'title': '可惜没如果',
'release_date': '20141227',
'creator': '林俊杰',
- 'description': 'md5:d327722d0361576fde558f1ac68a7065',
+ 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
'thumbnail': r're:^https?://.*\.jpg$',
}
}, {
'note': 'There is no mp3-320 version of this song.',
- 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+ 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
'info_dict': {
'id': '004MsGEo3DdNxV',
@@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor):
}
}, {
'note': 'lyrics not in .lrc format',
- 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+ 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
'info_dict': {
'id': '001JyApY11tIp6',
'ext': 'mp3',
'title': 'Shadows Over Transylvania',
'release_date': '19970225',
'creator': 'Dark Funeral',
- 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+ 'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
@@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor):
[r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
detail_info_page, 'album mid', default=None)
if albummid:
- thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+ thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
% (albummid[-2:-1], albummid[-1], albummid)
guid = self.m_r_get_ruin()
@@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor):
def qq_static_url(category, mid):
return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
- @classmethod
- def get_entries_from_page(cls, page):
+ def get_singer_all_songs(self, singmid, num):
+ return self._download_webpage(
+ 'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+ query={
+ 'format': 'json',
+ 'inCharset': 'utf8',
+ 'outCharset': 'utf-8',
+ 'platform': 'yqq',
+ 'needNewCode': 0,
+ 'singermid': singmid,
+ 'order': 'listen',
+ 'begin': 0,
+ 'num': num,
+ 'songstatus': 1,
+ })
+
+ def get_entries_from_page(self, singmid):
entries = []
- for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page):
- song_mid = unescapeHTML(item).split('|')[-5]
- entries.append(cls.url_result(
- 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
- song_mid))
+ default_num = 1
+ json_text = self.get_singer_all_songs(singmid, default_num)
+ json_obj_all_songs = self._parse_json(json_text, singmid)
+
+ if json_obj_all_songs['code'] == 0:
+ total = json_obj_all_songs['data']['total']
+ json_text = self.get_singer_all_songs(singmid, total)
+ json_obj_all_songs = self._parse_json(json_text, singmid)
+
+ for item in json_obj_all_songs['data']['list']:
+ if item['musicData'].get('songmid') is not None:
+ songmid = item['musicData']['songmid']
+ entries.append(self.url_result(
+ 'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
return entries
@@ -172,42 +195,32 @@ class QQPlaylistBaseIE(InfoExtractor):
class QQMusicSingerIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:singer'
IE_DESC = 'QQ音乐 - 歌手'
- _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
_TEST = {
- 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+ 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
'info_dict': {
'id': '001BLpXF2DyJe2',
'title': '林俊杰',
'description': 'md5:870ec08f7d8547c29c93010899103751',
},
- 'playlist_count': 12,
+ 'playlist_mincount': 12,
}
def _real_extract(self, url):
mid = self._match_id(url)
- singer_page = self._download_webpage(
- self.qq_static_url('singer', mid), mid, 'Download singer page')
-
- entries = self.get_entries_from_page(singer_page)
-
+ entries = self.get_entries_from_page(mid)
+ singer_page = self._download_webpage(url, mid, 'Download singer page')
singer_name = self._html_search_regex(
- r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
- default=None)
-
- singer_id = self._html_search_regex(
- r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
- default=None)
-
+ r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
singer_desc = None
- if singer_id:
- req = sanitized_Request(
- 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
- req.add_header(
- 'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+ if mid:
singer_desc_page = self._download_xml(
- req, mid, 'Donwload singer description XML')
+ 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+ 'Download singer description XML',
+ query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+ headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
singer_desc = singer_desc_page.find('./data/info/desc').text
@@ -217,10 +230,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
class QQMusicAlbumIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:album'
IE_DESC = 'QQ音乐 - 专辑'
- _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+ 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
'info_dict': {
'id': '000gXCTb2AhRR1',
'title': '我们都是这样长大的',
@@ -228,7 +241,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
},
'playlist_count': 4,
}, {
- 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+ 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
'info_dict': {
'id': '002Y5a3b3AlCu3',
'title': '그리고...',
@@ -246,7 +259,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
entries = [
self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+ 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
) for song in album['list']
]
album_name = album.get('name')
@@ -260,31 +273,30 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
class QQMusicToplistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:toplist'
IE_DESC = 'QQ音乐 - 排行榜'
- _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=toplist&p=global_123',
+ 'url': 'https://y.qq.com/n/yqq/toplist/123.html',
'info_dict': {
- 'id': 'global_123',
+ 'id': '123',
'title': '美国iTunes榜',
+ 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',
},
- 'playlist_count': 10,
+ 'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=top_3',
+ 'url': 'https://y.qq.com/n/yqq/toplist/3.html',
'info_dict': {
- 'id': 'top_3',
+ 'id': '3',
'title': '巅峰榜·欧美',
- 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
- '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
- '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
- '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+ 'description': 'md5:5a600d42c01696b26b71f8c4d43407da',
},
'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=global_106',
+ 'url': 'https://y.qq.com/n/yqq/toplist/106.html',
'info_dict': {
- 'id': 'global_106',
+ 'id': '106',
'title': '韩国Mnet榜',
+ 'description': 'md5:cb84b325215e1d21708c615cac82a6e7',
},
'playlist_count': 50,
}]
@@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
def _real_extract(self, url):
list_id = self._match_id(url)
- list_type, num_id = list_id.split("_")
-
toplist_json = self._download_json(
- 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
- % (list_type, num_id),
- list_id, 'Download toplist page')
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id,
+ note='Download toplist page',
+ query={'type': 'toplist', 'topid': list_id, 'format': 'json'})
- entries = [
- self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
- ) for song in toplist_json['songlist']
- ]
+ entries = [self.url_result(
+ 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic',
+ song['data']['songmid'])
+ for song in toplist_json['songlist']]
topinfo = toplist_json.get('topinfo', {})
list_name = topinfo.get('ListName')
@@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
class QQMusicPlaylistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:playlist'
IE_DESC = 'QQ音乐 - 歌单'
- _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+ 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html',
'info_dict': {
'id': '3462654915',
'title': '韩国5月新歌精选下旬',
@@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
'playlist_count': 40,
'skip': 'playlist gone',
}, {
- 'url': 'http://y.qq.com/#type=taoge&id=1374105607',
+ 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html',
'info_dict': {
'id': '1374105607',
'title': '易入人心的华语民谣',
@@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
list_id = self._match_id(url)
list_json = self._download_json(
- 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
- % list_id, list_id, 'Download list page',
+ 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg',
+ list_id, 'Download list page',
+ query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id},
transform_source=strip_jsonp)
if not len(list_json.get('cdlist', [])):
if list_json.get('code'):
@@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
raise ExtractorError('Unable to get playlist info')
cdlist = list_json['cdlist'][0]
- entries = [
- self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
- ) for song in cdlist['songlist']
- ]
+ entries = [self.url_result(
+ 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'])
+ for song in cdlist['songlist']]
list_name = cdlist.get('dissname')
list_description = clean_html(unescapeHTML(cdlist.get('desc')))
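The rewritten get_entries_from_page uses a probe-then-fetch pattern: request a single song from the singer-track API to learn the catalogue total, then re-request with that total to get every song in one page. Reduced to a sketch (fetch_page is a hypothetical stand-in for get_singer_all_songs):

    import json

    def fetch_all_songmids(singer_mid, fetch_page):
        probe = json.loads(fetch_page(singer_mid, num=1))  # probe for the total
        if probe.get('code') != 0:
            return []
        total = probe['data']['total']
        full = json.loads(fetch_page(singer_mid, num=total))  # fetch everything
        return [item['musicData']['songmid']
                for item in full['data']['list']
                if item['musicData'].get('songmid')]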
diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py
new file mode 100644
index 000000000..01c85ee01
--- /dev/null
+++ b/youtube_dl/extractor/reddit.py
@@ -0,0 +1,114 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+)
+
+
+class RedditIE(InfoExtractor):
+ _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
+ _TEST = {
+ # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
+ 'url': 'https://v.redd.it/zv89llsvexdz',
+ 'md5': '655d06ace653ea3b87bccfb1b27ec99d',
+ 'info_dict': {
+ 'id': 'zv89llsvexdz',
+ 'ext': 'mp4',
+ 'title': 'zv89llsvexdz',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats = self._extract_m3u8_formats(
+ 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
+ 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+ formats.extend(self._extract_mpd_formats(
+ 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
+ mpd_id='dash', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'formats': formats,
+ }
+
+
+class RedditRIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
+ 'info_dict': {
+ 'id': 'zv89llsvexdz',
+ 'ext': 'mp4',
+ 'title': 'That small heart attack.',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1501941939,
+ 'upload_date': '20170805',
+ 'uploader': 'Antw87',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
+ 'only_matching': True,
+ }, {
+ # imgur
+ 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+ 'only_matching': True,
+ }, {
+ # streamable
+ 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',
+ 'only_matching': True,
+ }, {
+ # youtube
+ 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ url + '.json', video_id)[0]['data']['children'][0]['data']
+
+ video_url = data['url']
+
+ # Avoid recursing into the same reddit URL
+ if 'reddit.com/' in video_url and '/%s/' % video_id in video_url:
+ raise ExtractorError('No media found', expected=True)
+
+ over_18 = data.get('over_18')
+ if over_18 is True:
+ age_limit = 18
+ elif over_18 is False:
+ age_limit = 0
+ else:
+ age_limit = None
+
+ return {
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'title': data.get('title'),
+ 'thumbnail': data.get('thumbnail'),
+ 'timestamp': float_or_none(data.get('created_utc')),
+ 'uploader': data.get('author'),
+ 'like_count': int_or_none(data.get('ups')),
+ 'dislike_count': int_or_none(data.get('downs')),
+ 'comment_count': int_or_none(data.get('num_comments')),
+ 'age_limit': age_limit,
+ }
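RedditRIE returns a url_transparent result: the target URL (imgur, streamable, youtube, v.redd.it, ...) is extracted by whichever extractor matches it, and the reddit-side metadata is then layered on top. A simplified model of that merge (the exact precedence rules are assumed here, not quoted from youtube-dl):

    def resolve_url_transparent(transparent_result, run_extractor):
        # Run the extractor that matches the delegated URL, then let the
        # transparent result's non-empty fields fill in / override metadata.
        base = dict(run_extractor(transparent_result['url']))
        for key, value in transparent_result.items():
            if key not in ('_type', 'url') and value is not None:
                base[key] = value
        return base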
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 3f1a46bb2..2e52e092b 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -31,6 +31,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
+ (?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
@@ -121,7 +122,7 @@ class SoundcloudIE(InfoExtractor):
},
]
- _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'
+ _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@staticmethod
@@ -330,7 +331,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
}
-class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
+class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
+ _API_BASE = 'https://api.soundcloud.com'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ def _extract_playlist(self, base_url, playlist_id, playlist_title):
+ COMMON_QUERY = {
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ }
+
+ query = COMMON_QUERY.copy()
+ query['offset'] = 0
+
+ next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
+
+ entries = []
+ for i in itertools.count():
+ response = self._download_json(
+ next_href, playlist_id, 'Downloading track page %s' % (i + 1))
+
+ collection = response['collection']
+ if not collection:
+ break
+
+ def resolve_permalink_url(candidates):
+ for cand in candidates:
+ if isinstance(cand, dict):
+ permalink_url = cand.get('permalink_url')
+ entry_id = self._extract_id(cand)
+ if permalink_url and permalink_url.startswith('http'):
+ return permalink_url, entry_id
+
+ for e in collection:
+ permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+ if permalink_url:
+ entries.append(self.url_result(permalink_url, video_id=entry_id))
+
+ next_href = response.get('next_href')
+ if not next_href:
+ break
+
+ parsed_next_href = compat_urlparse.urlparse(response['next_href'])
+ qs = compat_urlparse.parse_qs(parsed_next_href.query)
+ qs.update(COMMON_QUERY)
+ next_href = compat_urlparse.urlunparse(
+ parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_title,
+ 'entries': entries,
+ }
+
+
+class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:(?:www|m)\.)?soundcloud\.com/
@@ -385,16 +442,13 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
'playlist_mincount': 1,
}]
- _API_BASE = 'https://api.soundcloud.com'
- _API_V2_BASE = 'https://api-v2.soundcloud.com'
-
_BASE_URL_MAP = {
- 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
- 'tracks': '%s/users/%%s/tracks' % _API_BASE,
- 'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
- 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
- 'likes': '%s/users/%%s/likes' % _API_V2_BASE,
- 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+ 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE,
+ 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
}
_TITLE_MAP = {
@@ -416,57 +470,36 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
resolv_url, uploader, 'Downloading user info')
resource = mobj.group('rsrc') or 'all'
- base_url = self._BASE_URL_MAP[resource] % user['id']
- COMMON_QUERY = {
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- 'linked_partitioning': '1',
- }
+ return self._extract_playlist(
+ self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']),
+ '%s (%s)' % (user['username'], self._TITLE_MAP[resource]))
- query = COMMON_QUERY.copy()
- query['offset'] = 0
- next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
-
- entries = []
- for i in itertools.count():
- response = self._download_json(
- next_href, uploader, 'Downloading track page %s' % (i + 1))
-
- collection = response['collection']
- if not collection:
- break
-
- def resolve_permalink_url(candidates):
- for cand in candidates:
- if isinstance(cand, dict):
- permalink_url = cand.get('permalink_url')
- entry_id = self._extract_id(cand)
- if permalink_url and permalink_url.startswith('http'):
- return permalink_url, entry_id
+class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
+ IE_NAME = 'soundcloud:trackstation'
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
+ 'info_dict': {
+ 'id': '286017854',
+ 'title': 'Track station: your-text',
+ },
+ 'playlist_mincount': 47,
+ }]
- for e in collection:
- permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
- if permalink_url:
- entries.append(self.url_result(permalink_url, video_id=entry_id))
+ def _real_extract(self, url):
+ track_name = self._match_id(url)
- next_href = response.get('next_href')
- if not next_href:
- break
+ webpage = self._download_webpage(url, track_name)
- parsed_next_href = compat_urlparse.urlparse(response['next_href'])
- qs = compat_urlparse.parse_qs(parsed_next_href.query)
- qs.update(COMMON_QUERY)
- next_href = compat_urlparse.urlunparse(
- parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+ track_id = self._search_regex(
+ r'soundcloud:track-stations:(\d+)', webpage, 'track id')
- return {
- '_type': 'playlist',
- 'id': compat_str(user['id']),
- 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
- 'entries': entries,
- }
+ return self._extract_playlist(
+ '%s/stations/soundcloud:track-stations:%s/tracks'
+ % (self._API_V2_BASE, track_id),
+ track_id, 'Track station: %s' % track_name)
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
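Both SoundcloudUserIE and the new SoundcloudTrackStationIE now share _extract_playlist, which walks the API's linked_partitioning pages by following next_href until a page comes back empty. The loop's shape, with the networking stubbed out:

    def collect_pages(first_url, download_json):
        # Accumulate every page's collection, following next_href links,
        # as SoundcloudPagedPlaylistBaseIE._extract_playlist does.
        entries, next_href = [], first_url
        while next_href:
            page = download_json(next_href)
            collection = page.get('collection')
            if not collection:
                break
            entries.extend(collection)
            next_href = page.get('next_href')
        return entries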
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 8598377b0..84298fee4 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -122,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
},
'playlist_count': 6,
+ }, {
+ # Nexx iFrame embed
+ 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+ 'info_dict': {
+ 'id': '161464',
+ 'ext': 'mp4',
+ 'title': 'Nervenkitzel Achterbahn',
+ 'alt_title': 'Karussellbauer in Deutschland',
+ 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+ 'release_year': 2005,
+ 'creator': 'SPIEGEL TV',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2761,
+ 'timestamp': 1394021479,
+ 'upload_date': '20140305',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index e7bd5bf91..54497c880 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
class SportBoxEmbedIE(InfoExtractor):
@@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor):
'info_dict': {
'id': '211355',
'ext': 'mp4',
- 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+ 'title': '211355',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 292,
+ 'view_count': int,
},
'params': {
# m3u8 download
@@ -24,6 +30,9 @@ class SportBoxEmbedIE(InfoExtractor):
}, {
'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
'only_matching': True,
+ }, {
+ 'url': 'https://news.sportbox.ru/vdl/player/media/193095',
+ 'only_matching': True,
}]
@staticmethod
@@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- formats = []
-
- def cleanup_js(code):
- # desktop_advert_config contains complex Javascripts and we don't need it
- return js_to_json(re.sub(r'desktop_advert_config.*', '', code))
-
- jwplayer_data = self._parse_json(self._search_regex(
- r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id,
- transform_source=cleanup_js)
-
- hls_url = jwplayer_data.get('hls_url')
- if hls_url:
- formats.extend(self._extract_m3u8_formats(
- hls_url, video_id, ext='mp4', m3u8_id='hls'))
-
- rtsp_url = jwplayer_data.get('rtsp_url')
- if rtsp_url:
- formats.append({
- 'url': rtsp_url,
- 'format_id': 'rtsp',
- })
+ wjplayer_data = self._parse_json(
+ self._search_regex(
+ r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'),
+ video_id, transform_source=js_to_json)
+ formats = []
+ for source in wjplayer_data['sources']:
+ src = source.get('src')
+ if not src:
+ continue
+ if determine_ext(src) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ })
self._sort_formats(formats)
- title = jwplayer_data['node_title']
- thumbnail = jwplayer_data.get('image_url')
+ view_count = int_or_none(self._search_regex(
+ r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
+ 'title': video_id,
+ 'thumbnail': wjplayer_data.get('poster'),
+ 'duration': int_or_none(wjplayer_data.get('duration')),
+ 'view_count': view_count,
'formats': formats,
}
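The rewritten sources loop above is the codebase's usual extension dispatch: an m3u8 src expands into one format per HLS variant, anything else is kept as a single progressive URL. A self-contained sketch of the same dispatch, with invented sources data and a placeholder for _extract_m3u8_formats():

    from posixpath import splitext

    def determine_ext(url, default='unknown'):
        # Rough equivalent of utils.determine_ext(): URL path extension.
        return splitext(url.partition('?')[0])[1][1:] or default

    def expand_m3u8(url):
        # Placeholder for _extract_m3u8_formats(), which would download
        # the manifest and return one format dict per variant stream.
        return [{'url': url, 'format_id': 'hls', 'protocol': 'm3u8_native'}]

    sources = [  # invented, shaped like the wjplayer 'sources' array
        {'src': 'https://example.com/video/master.m3u8'},
        {'src': 'https://example.com/video/file.mp4'},
        {},  # entries without 'src' are skipped
    ]

    formats = []
    for source in sources:
        src = source.get('src')
        if not src:
            continue
        if determine_ext(src) == 'm3u8':
            formats.extend(expand_m3u8(src))
        else:
            formats.append({'url': src})
    print(formats)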
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index 1b5afb73e..48bc4529e 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE):
if video_id:
data = self._download_json(
- 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+ 'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+ video_id, headers=self.geo_verification_headers())
info_dict = self._extract_video(data, video_id)
if not info_dict.get('title'):
info_dict['title'] = re.sub(
diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py
index bf93eb868..e9474533f 100644
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@@ -8,6 +8,9 @@ from ..utils import extract_attributes
class TBSIE(TurnerBaseIE):
+ # https://github.com/rg3/youtube-dl/issues/13658
+ _WORKING = False
+
_VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
_TESTS = [{
'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4',
'title': 'Theatrical Trailer',
'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
- }
+ },
+ 'skip': 'TBS videos are deleted after a while',
}, {
'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4',
'title': 'You Better Run',
'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
- }
+ },
+ 'skip': 'TBS videos are deleted after a while',
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py
deleted file mode 100644
index a8c6ed7be..000000000
--- a/youtube_dl/extractor/teamfourstar.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from ..utils import unified_strdate
-
-
-class TeamFourStarIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)'
- _TEST = {
- 'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/',
- 'info_dict': {
- 'id': '0WdZO31W',
- 'title': 'TFS Abridged Parody Episode 1',
- 'description': 'md5:d60bc389588ebab2ee7ad432bda953ae',
- 'ext': 'mp4',
- 'timestamp': 1394168400,
- 'upload_date': '20080508',
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- jwplatform_url = JWPlatformIE._extract_url(webpage)
-
- video_title = self._html_search_regex(
- r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>',
- webpage, 'title')
- video_date = unified_strdate(self._html_search_regex(
- r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>',
- webpage, 'date', fatal=False))
- video_description = self._html_search_regex(
- r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>',
- webpage, 'description', fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- '_type': 'url_transparent',
- 'display_id': display_id,
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- 'url': jwplatform_url,
- }
diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py
index 4fd1aa4bf..a42977f39 100644
--- a/youtube_dl/extractor/twentymin.py
+++ b/youtube_dl/extractor/twentymin.py
@@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor):
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
- r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
webpage)]
def _real_extract(self, url):
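The widened (?:(?:https?:)?//)? prefix makes the embed regex accept absolute, protocol-relative and bare-host iframe URLs alike. A quick standalone check (pattern copied verbatim from the hunk, sample iframes invented):

    import re

    EMBED_RE = (r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?'
                r'20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1')

    samples = [
        '<iframe src="https://www.20min.ch/videoplayer/videoplayer.html?params=videoId@123"></iframe>',
        '<iframe src="//www.20min.ch/videoplayer/videoplayer.html?params=videoId@123"></iframe>',
        '<iframe src="www.20min.ch/videoplayer/videoplayer.html?params=videoId@123"></iframe>',
    ]
    for html in samples:
        m = re.search(EMBED_RE, html)
        print(m.group('url') if m else 'no match')
    # all three samples print the captured URL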
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 160be1b1b..207c4a6a7 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -15,6 +15,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ js_to_json,
sanitized_Request,
unescapeHTML,
urlencode_postdata,
@@ -73,7 +74,7 @@ class UdemyIE(InfoExtractor):
return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url
checkout_url = unescapeHTML(self._search_regex(
- r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1',
+ r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',
webpage, 'checkout url', group='url', default=None))
if checkout_url:
raise ExtractorError(
@@ -268,6 +269,25 @@ class UdemyIE(InfoExtractor):
f = add_output_format_meta(f, format_id)
formats.append(f)
+ def extract_subtitles(track_list):
+ if not isinstance(track_list, list):
+ return
+ for track in track_list:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ src = track.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ lang = track.get('language') or track.get(
+ 'srclang') or track.get('label')
+ sub_dict = automatic_captions if track.get(
+ 'autogenerated') is True else subtitles
+ sub_dict.setdefault(lang, []).append({
+ 'url': src,
+ })
+
download_urls = asset.get('download_urls')
if isinstance(download_urls, dict):
extract_formats(download_urls.get('Video'))
@@ -315,23 +335,16 @@ class UdemyIE(InfoExtractor):
extract_formats(data.get('sources'))
if not duration:
duration = int_or_none(data.get('duration'))
- tracks = data.get('tracks')
- if isinstance(tracks, list):
- for track in tracks:
- if not isinstance(track, dict):
- continue
- if track.get('kind') != 'captions':
- continue
- src = track.get('src')
- if not src or not isinstance(src, compat_str):
- continue
- lang = track.get('language') or track.get(
- 'srclang') or track.get('label')
- sub_dict = automatic_captions if track.get(
- 'autogenerated') is True else subtitles
- sub_dict.setdefault(lang, []).append({
- 'url': src,
- })
+ extract_subtitles(data.get('tracks'))
+
+ if not subtitles and not automatic_captions:
+ text_tracks = self._parse_json(
+ self._search_regex(
+ r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
+ 'text tracks', default='{}', group='data'), video_id,
+ transform_source=lambda s: js_to_json(unescapeHTML(s)),
+ fatal=False)
+ extract_subtitles(text_tracks)
self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
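Factoring the track loop into extract_subtitles() lets the same guard-heavy parser run twice: once over the JSON 'tracks' array and once over the text-tracks attribute scraped from the view HTML, which is HTML-escaped JS and therefore goes through unescapeHTML and js_to_json first. A standalone sketch of that unescape-then-parse step, with an invented attribute payload:

    import json
    try:
        from html import unescape  # Python 3
    except ImportError:  # Python 2
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    # Invented payload as it might sit in the attribute:
    raw = ('[{&quot;kind&quot;:&quot;captions&quot;,'
           '&quot;language&quot;:&quot;en&quot;,'
           '&quot;src&quot;:&quot;https://example.com/en.vtt&quot;}]')

    # unescapeHTML turns &quot; back into quotes; this sample is then
    # already valid JSON, so js_to_json would pass it through unchanged.
    tracks = json.loads(unescape(raw))
    print(tracks[0]['language'], tracks[0]['src'])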
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
index 6be3774b7..570fa45ea 100644
--- a/youtube_dl/extractor/vh1.py
+++ b/youtube_dl/extractor/vh1.py
@@ -121,7 +121,11 @@ class VH1IE(MTVIE):
idoc = self._download_xml(
doc_url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
- return self.playlist_result(
- [self._get_video_info(item) for item in idoc.findall('.//item')],
- playlist_id=video_id,
- )
+
+ entries = []
+ for item in idoc.findall('.//item'):
+ info = self._get_video_info(item)
+ if info:
+ entries.append(info)
+
+ return self.playlist_result(entries, playlist_id=video_id)
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index 701bb1d01..01da32f1c 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -56,7 +56,8 @@ class VidioIE(InfoExtractor):
self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex(
- r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration'))
+ r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
+ 'duration', fatal=False, group='duration'))
thumbnail = thumbnail or self._og_search_thumbnail(webpage)
like_count = int_or_none(self._search_regex(
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index e9ff336c4..a7971d72e 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
import itertools
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
ExtractorError,
int_or_none,
@@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor):
'or for violating the terms of use.',
expected=True)
- formats = [{
- 'format_id': f.get('type'),
- 'url': f['uri'],
- 'width': int_or_none(f.get('width')),
- 'height': int_or_none(f.get('height')),
- 'preference': 0 if f.get('type', '').endswith('clip') else 1,
- } for f in video.get('formats', []) if f.get('uri')]
+ formats = []
+ for f in video.get('formats', []):
+ format_url = f.get('uri')
+ if not format_url or not isinstance(format_url, compat_str):
+ continue
+ format_type = f.get('type')
+ if format_type == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False))
+ elif format_type == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': f.get('type'),
+ 'url': format_url,
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'preference': 0 if f.get('type', '').endswith(
+ 'clip') else 1,
+ })
if not formats and video.get('complete_url'):
formats.append({
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index 77c120a57..64d0224e6 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -236,7 +236,12 @@ class VLiveChannelIE(InfoExtractor):
query={
'app_id': app_id,
'channelSeq': channel_seq,
- 'maxNumOfRows': 1000,
+ # Large values of maxNumOfRows (~300 or above) may cause
+ # empty responses (see [1]), e.g. this happens for [2] that
+ # has more than 300 videos.
+ # 1. https://github.com/rg3/youtube-dl/issues/13830
+ # 2. http://channels.vlive.tv/EDBF.
+ 'maxNumOfRows': 100,
'_': int(time.time()),
'pageNo': page_num
}
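With maxNumOfRows capped at 100, channels with more videos depend on the pageNo parameter; the surrounding loop (outside this hunk) keeps requesting pages until one comes back short or empty. A minimal sketch of that paging contract, with a fake fetcher standing in for the real _download_json() call:

    import itertools

    PAGE_SIZE = 100  # matches the new maxNumOfRows value

    def fetch_page(page_no):
        # Stand-in for the channel video list API; returns a page of ids.
        catalogue = ['video-%d' % i for i in range(250)]  # invented channel
        start = (page_no - 1) * PAGE_SIZE
        return catalogue[start:start + PAGE_SIZE]

    videos = []
    for page_no in itertools.count(1):
        rows = fetch_page(page_no)
        if not rows:
            break
        videos.extend(rows)
        if len(rows) < PAGE_SIZE:  # a short page means we are done
            break
    print(len(videos))  # -> 250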
diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py
new file mode 100644
index 000000000..5de3deb8c
--- /dev/null
+++ b/youtube_dl/extractor/voot.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class VootIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)'
+ _GEO_COUNTRIES = ['IN']
+ _TESTS = [{
+ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
+ 'info_dict': {
+ 'id': '0_8ledb18o',
+ 'ext': 'mp4',
+ 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',
+ 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
+ 'uploader_id': 'batchUser',
+ 'timestamp': 1472162937,
+ 'upload_date': '20160825',
+ 'duration': 1146,
+ 'series': 'Ishq Ka Rang Safed',
+ 'season_number': 1,
+ 'episode': 'Is this the end of Kamini?',
+ 'episode_number': 340,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.voot.com/movies/pandavas-5/424627',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ media_info = self._download_json(
+ 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
+ query={
+ 'platform': 'Web',
+ 'pId': 2,
+ 'mediaId': video_id,
+ })
+
+ status_code = try_get(media_info, lambda x: x['status']['code'], int)
+ if status_code != 0:
+ raise ExtractorError(media_info['status']['message'], expected=True)
+
+ media = media_info['assets']
+
+ entry_id = media['EntryId']
+ title = media['MediaName']
+
+ description, series, season_number, episode, episode_number = [None] * 5
+
+ for meta in try_get(media, lambda x: x['Metas'], list) or []:
+ key, value = meta.get('Key'), meta.get('Value')
+ if not key or not value:
+ continue
+ if key == 'ContentSynopsis':
+ description = value
+ elif key == 'RefSeriesTitle':
+ series = value
+ elif key == 'RefSeriesSeason':
+ season_number = int_or_none(value)
+ elif key == 'EpisodeMainTitle':
+ episode = value
+ elif key == 'EpisodeNo':
+ episode_number = int_or_none(value)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'kaltura:1982551:%s' % entry_id,
+ 'ie_key': KalturaIE.ie_key(),
+ 'title': title,
+ 'description': description,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'timestamp': unified_timestamp(media.get('CreationDate')),
+ 'duration': int_or_none(media.get('Duration')),
+ 'view_count': int_or_none(media.get('ViewCounter')),
+ 'like_count': int_or_none(media.get('like_counter')),
+ }
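The API's Metas field arrives as a list of {'Key': ..., 'Value': ...} pairs rather than a flat object, hence the if/elif chain above. An equivalent sketch that folds the pairs into a dict first (keys taken from the code above, sample values from the test case; int_or_none re-sketched inline):

    def int_or_none(v):
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    metas = [  # shaped like the getMediaInfo 'Metas' array
        {'Key': 'RefSeriesTitle', 'Value': 'Ishq Ka Rang Safed'},
        {'Key': 'RefSeriesSeason', 'Value': '1'},
        {'Key': 'EpisodeNo', 'Value': '340'},
    ]
    flat = {m['Key']: m['Value']
            for m in metas if m.get('Key') and m.get('Value')}

    info = {
        'series': flat.get('RefSeriesTitle'),
        'season_number': int_or_none(flat.get('RefSeriesSeason')),
        'episode_number': int_or_none(flat.get('EpisodeNo')),
    }
    print(info)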
diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py
index b270f08d1..02fcd52c7 100644
--- a/youtube_dl/extractor/vzaar.py
+++ b/youtube_dl/extractor/vzaar.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor):
},
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)',
+ webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
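_extract_urls() is the hook the generic extractor calls to discover embedded players on arbitrary pages; it only has to return every matching iframe src. A quick standalone check of the new pattern (regex copied from the hunk, sample page invented):

    import re

    VZAAR_IFRAME_RE = r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)'

    webpage = '''
    <iframe src="https://view.vzaar.com/11596793/player"></iframe>
    <iframe src="//view.vzaar.com/2400396/player"></iframe>
    '''
    print(re.findall(VZAAR_IFRAME_RE, webpage))
    # -> ['https://view.vzaar.com/11596793', '//view.vzaar.com/2400396']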
diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py
new file mode 100644
index 000000000..b382338fa
--- /dev/null
+++ b/youtube_dl/extractor/watchbox.py
@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ strip_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class WatchBoxIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)'
+ _TESTS = [{
+ # film
+ 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html',
+ 'info_dict': {
+ 'id': '341368',
+ 'ext': 'mp4',
+ 'title': 'Free Jimmy',
+ 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 4890,
+ 'age_limit': 16,
+ 'release_year': 2009,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ # episode
+ 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html',
+ 'info_dict': {
+ 'id': '328286',
+ 'ext': 'mp4',
+ 'title': 'S01 E01 - Date in der Hölle',
+ 'description': 'md5:2f31c74a8186899f33cb5114491dae2b',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1291,
+ 'age_limit': 12,
+ 'release_year': 2010,
+ 'series': 'Ugly Americans',
+ 'season_number': 1,
+ 'episode': 'Date in der Hölle',
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id = mobj.group('kind', 'id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ source = self._parse_json(
+ self._search_regex(
+ r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source',
+ default='{}'),
+ video_id, transform_source=js_to_json, fatal=False) or {}
+
+ video_id = compat_str(source.get('videoId') or video_id)
+
+ devapi = self._download_json(
+ 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={
+ 'format': 'json',
+ 'apikey': 'hbbtv',
+ }, fatal=False)
+
+ item = try_get(devapi, lambda x: x['items'][0], dict) or {}
+
+ title = item.get('title') or try_get(
+ item, lambda x: x['movie']['headline_movie'],
+ compat_str) or source['title']
+
+ formats = []
+ hls_url = item.get('media_videourl_hls') or source.get('hls')
+ if hls_url:
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ dash_url = item.get('media_videourl_wv') or source.get('dash')
+ if dash_url:
+ formats.extend(self._extract_mpd_formats(
+ dash_url, video_id, mpd_id='dash', fatal=False))
+ mp4_url = item.get('media_videourl')
+ if mp4_url:
+ formats.append({
+ 'url': mp4_url,
+ 'format_id': 'mp4',
+ 'width': int_or_none(item.get('width')),
+ 'height': int_or_none(item.get('height')),
+ 'tbr': int_or_none(item.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ description = strip_or_none(item.get('descr'))
+ thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail')
+ duration = int_or_none(item.get('media_length') or source.get('length'))
+ timestamp = unified_timestamp(item.get('pubDate'))
+ view_count = int_or_none(item.get('media_views'))
+ age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk']))
+ release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year']))
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'age_limit': age_limit,
+ 'release_year': release_year,
+ 'formats': formats,
+ }
+
+ if kind.lower() == 'serien':
+ series = try_get(
+ item, lambda x: x['special']['title'],
+ compat_str) or source.get('format')
+ season_number = int_or_none(self._search_regex(
+ r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number',
+ default=None) or self._search_regex(
+ r'/staffel-(\d+)/', url, 'season number', default=None))
+ episode = source.get('title')
+ episode_number = int_or_none(self._search_regex(
+ r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number',
+ default=None))
+ info.update({
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ })
+
+ return info
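For 'serien' URLs the episode metadata is recovered from two independent places: the 'S01 E01 - ...' prefix of the API title, with the /staffel-N/ URL segment as a season fallback. A standalone sketch of that first-match-wins lookup (title and URL mirror the test case above):

    import re

    def first_int(candidates):
        # Return the first regex capture that matches, as an int.
        for pattern, source in candidates:
            m = re.search(pattern, source)
            if m:
                return int(m.group(1))
        return None

    title = 'S01 E01 - Date in der Hölle'
    url = ('https://www.watchbox.de/serien/ugly-americans-12231/'
           'staffel-1/date-in-der-hoelle-328286.html')

    season_number = first_int([
        (r'^S(\d{1,2})\s*E\d{1,2}', title),  # primary: title prefix
        (r'/staffel-(\d+)/', url),           # fallback: URL path
    ])
    episode_number = first_int([(r'^S\d{1,2}\s*E(\d{1,2})', title)])
    print(season_number, episode_number)  # -> 1 1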
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py
index 5c8f17eb2..e34ebe3a6 100644
--- a/youtube_dl/extractor/xxxymovies.py
+++ b/youtube_dl/extractor/xxxymovies.py
@@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor):
r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
title = self._html_search_regex(
- [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
- r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'],
+ [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<',
+ r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'],
webpage, 'title')
thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py
new file mode 100644
index 000000000..e8f6ae10f
--- /dev/null
+++ b/youtube_dl/extractor/yandexdisk.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class YandexDiskIE(InfoExtractor):
+ _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
+ 'md5': '33955d7ae052f15853dc41f35f17581c',
+ 'info_dict': {
+ 'id': 'VdOeDou8eZs6Y',
+ 'ext': 'mp4',
+ 'title': '4.mp4',
+ 'duration': 168.6,
+ 'uploader': 'y.botova',
+ 'uploader_id': '300043621',
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ status = self._download_webpage(
+ 'https://disk.yandex.com/auth/status', video_id, query={
+ 'urlOrigin': url,
+ 'source': 'public',
+ 'md5': 'false',
+ })
+
+ sk = self._search_regex(
+ r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2',
+ status, 'sk', group='value')
+
+ webpage = self._download_webpage(url, video_id)
+
+ models = self._parse_json(
+ self._search_regex(
+ r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script',
+ webpage, 'video JSON'),
+ video_id)
+
+ data = next(
+ model['data'] for model in models
+ if model.get('model') == 'resource')
+
+ video_hash = data['id']
+ title = data['name']
+
+ models = self._download_json(
+ 'https://disk.yandex.com/models/', video_id,
+ data=urlencode_postdata({
+ '_model.0': 'videoInfo',
+ 'id.0': video_hash,
+ '_model.1': 'do-get-resource-url',
+ 'id.1': video_hash,
+ 'version': '13.6',
+ 'sk': sk,
+ }), query={'_m': 'videoInfo'})['models']
+
+ videos = try_get(models, lambda x: x[0]['data']['videos'], list) or []
+ source_url = try_get(
+ models, lambda x: x[1]['data']['file'], compat_str)
+
+ formats = []
+ if source_url:
+ formats.append({
+ 'url': source_url,
+ 'format_id': 'source',
+ 'ext': determine_ext(title, 'mp4'),
+ 'quality': 1,
+ })
+ for video in videos:
+ format_url = video.get('url')
+ if not format_url:
+ continue
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ })
+ self._sort_formats(formats)
+
+ duration = float_or_none(try_get(
+ models, lambda x: x[0]['data']['duration']), 1000)
+ uploader = try_get(
+ data, lambda x: x['user']['display_name'], compat_str)
+ uploader_id = try_get(
+ data, lambda x: x['user']['uid'], compat_str)
+ view_count = int_or_none(try_get(
+ data, lambda x: x['meta']['views_counter']))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
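The /models/ endpoint is batched: the '_model.0'/'id.0' and '_model.1'/'id.1' pairs pack two model queries (stream metadata and the source-file URL) into one POST, and the response carries a parallel 'models' list. A sketch of assembling such a payload (field names copied from the hunk, values invented):

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:  # Python 2
        from urllib import urlencode

    def batch_payload(video_hash, sk, queries):
        # One '_model.N'/'id.N' pair per sub-query plus shared fields,
        # mirroring the urlencode_postdata() call above.
        data = {'version': '13.6', 'sk': sk}
        for n, model in enumerate(queries):
            data['_model.%d' % n] = model
            data['id.%d' % n] = video_hash
        return urlencode(sorted(data.items()))

    print(batch_payload('fake-hash', 'fake-sk',
                        ['videoInfo', 'do-get-resource-url']))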
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index b50f34e9b..f33fabe19 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -1,39 +1,95 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_duration,
+)
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+ _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
_TESTS = [{
'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
- 'md5': '78fc1901148284c69af12640e01c6310',
+ 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
'info_dict': {
'id': '2189178',
'ext': 'mp4',
'title': 'Zeichentrick 1',
'age_limit': 18,
+ 'duration': 2874,
}
}, {
'url': 'http://www.youjizz.com/videos/-2189178.html',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youjizz.com/videos/embed/31991001',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('embed_id')
+
webpage = self._download_webpage(url, video_id)
- # YouJizz's HTML5 player has invalid HTML
- webpage = webpage.replace('"controls', '" controls')
- age_limit = self._rta_search(webpage)
- video_title = self._html_search_regex(
- r'<title>\s*(.*)\s*</title>', webpage, 'title')
- info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+ title = self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'title')
+
+ formats = []
+
+ encodings = self._parse_json(
+ self._search_regex(
+ r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+ default='[]'),
+ video_id, fatal=False)
+ for encoding in encodings:
+ if not isinstance(encoding, dict):
+ continue
+ format_url = encoding.get('filename')
+ if not isinstance(format_url, compat_str):
+ continue
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ format_id = encoding.get('name') or encoding.get('quality')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': height,
+ })
+
+ if formats:
+ info_dict = {
+ 'formats': formats,
+ }
+ else:
+ # YouJizz's HTML5 player has invalid HTML
+ webpage = webpage.replace('"controls', '" controls')
+ info_dict = self._parse_html5_media_entries(
+ url, webpage, video_id)[0]
+
+ duration = parse_duration(self._search_regex(
+ r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
+ default=None))
+ uploader = self._search_regex(
+ r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
+ default=None)
info_dict.update({
'id': video_id,
- 'title': video_title,
- 'age_limit': age_limit,
+ 'title': title,
+ 'age_limit': self._rta_search(webpage),
+ 'duration': duration,
+ 'uploader': uploader,
})
return info_dict
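When the encodings array is present, each entry's name/quality label (e.g. '720p') serves both as the format id and, via a small regex, as the height; the old HTML5-player path survives only as a fallback. A standalone sketch of that label parsing, over invented encodings:

    import re

    def height_from_label(label):
        m = re.match(r'(\d+)[pP]', label or '')
        return int(m.group(1)) if m else None

    encodings = [  # invented, shaped like the page's 'encodings' array
        {'name': '720p', 'filename': 'https://example.com/v_720.mp4'},
        {'name': '480p', 'filename': 'https://example.com/v_480.mp4'},
        {'quality': 'auto', 'filename': 'https://example.com/master.m3u8'},
    ]
    formats = [{
        'url': e['filename'],
        'format_id': e.get('name') or e.get('quality'),
        'height': height_from_label(e.get('name') or e.get('quality')),
    } for e in encodings if e.get('filename')]
    print(formats)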
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index dcce15d77..0c4bc2eda 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import itertools
import random
import re
import string
@@ -14,7 +13,6 @@ from ..utils import (
js_to_json,
str_or_none,
strip_jsonp,
- urljoin,
)
@@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
_VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
IE_NAME = 'youku:show'
- _TEST = {
+ _TESTS = [{
'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
'info_dict': {
'id': 'zc7c670be07ff11e48b3f',
- 'title': '花千骨 未删减版',
+ 'title': '花千骨 DVD版',
'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
},
'playlist_count': 50,
- }
+ }, {
+ # Episode number not starting from 1
+ 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+ 'info_dict': {
+ 'id': 'zefbfbd70efbfbd780bef',
+ 'title': '超级飞侠3',
+ 'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+ },
+ 'playlist_count': 24,
+ }, {
+ # Ongoing playlist. The initial page is the last one
+ 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+ 'only_matching': True,
+ }]
- _PAGE_SIZE = 40
+ def _extract_entries(self, playlist_data_url, show_id, note, query):
+ query['callback'] = 'cb'
+ playlist_data = self._download_json(
+ playlist_data_url, show_id, query=query, note=note,
+ transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+ drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+ get_element_by_class('p-drama-half-row', playlist_data))
+ if drama_list is None:
+ raise ExtractorError('No episodes found')
+ video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+ return playlist_data, [
+ self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+ for video_url in video_urls]
def _real_extract(self, url):
show_id = self._match_id(url)
@@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
page_config = self._parse_json(self._search_regex(
r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
show_id, transform_source=js_to_json)
- for idx in itertools.count(0):
- if idx == 0:
- playlist_data_url = 'http://list.youku.com/show/module'
- query = {'id': page_config['showid'], 'tab': 'point'}
- else:
- playlist_data_url = 'http://list.youku.com/show/point'
- query = {
- 'id': page_config['showid'],
- 'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1),
- }
- query['callback'] = 'cb'
- playlist_data = self._download_json(
- playlist_data_url, show_id, query=query,
+ first_page, initial_entries = self._extract_entries(
+ 'http://list.youku.com/show/module', show_id,
+ note='Downloading initial playlist data page',
+ query={
+ 'id': page_config['showid'],
+ 'tab': 'showInfo',
+ })
+ first_page_reload_id = self._html_search_regex(
+ r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+ # The first reload_id has the same items as first_page
+ reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+ for idx, reload_id in enumerate(reload_ids):
+ if reload_id == first_page_reload_id:
+ entries.extend(initial_entries)
+ continue
+ _, new_entries = self._extract_entries(
+ 'http://list.youku.com/show/episode', show_id,
note='Downloading playlist data page %d' % (idx + 1),
- transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
- video_urls = re.findall(
- r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"',
- playlist_data)
- new_entries = [
- self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
- for video_url in video_urls]
+ query={
+ 'id': page_config['showid'],
+ 'stage': reload_id,
+ })
entries.extend(new_entries)
- if len(new_entries) < self._PAGE_SIZE:
- break
desc = self._html_search_meta('description', webpage, fatal=False)
playlist_title = desc.split(',')[0] if desc else None
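The rewritten show pager drops the blind 40-item offsets: the initial 'showInfo' module both contains page one's episodes and, through its <li data-id="reload_N"> tabs, enumerates every further page to fetch, with the first reload id skipped because it duplicates the initial page. A condensed sketch of that ids-driven loop, with stand-in HTML and fetchers:

    import re

    def fetch_module(stage=None):
        # Stand-in for the JSONP 'html' payloads from list.youku.com.
        pages = {
            None: '<li data-id="reload_1"><li data-id="reload_41">'
                  '<div id="reload_1"><a href="//v.youku.com/ep1"></a></div>',
            'reload_41': '<a href="//v.youku.com/ep41"></a>',
        }
        return pages[stage]

    first_page = fetch_module()
    first_reload_id = re.search(
        r'<div[^>]+id="(reload_\d+)', first_page).group(1)

    entries = []
    for reload_id in re.findall(r'<li[^>]+data-id="([^"]+)"', first_page):
        page = (first_page if reload_id == first_reload_id
                else fetch_module(reload_id))
        entries.extend(re.findall(r'href="([^"]+)"', page))
    print(entries)  # -> ['//v.youku.com/ep1', '//v.youku.com/ep41']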