aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py11
-rw-r--r--youtube_dl/extractor/adultswim.py37
-rw-r--r--youtube_dl/extractor/bbc.py20
-rw-r--r--youtube_dl/extractor/comedycentral.py7
-rw-r--r--youtube_dl/extractor/common.py43
-rw-r--r--youtube_dl/extractor/condenast.py25
-rw-r--r--youtube_dl/extractor/eagleplatform.py27
-rw-r--r--youtube_dl/extractor/engadget.py2
-rw-r--r--youtube_dl/extractor/europa.py93
-rw-r--r--youtube_dl/extractor/fktv.py89
-rw-r--r--youtube_dl/extractor/generic.py12
-rw-r--r--youtube_dl/extractor/keek.py39
-rw-r--r--youtube_dl/extractor/kuwo.py5
-rw-r--r--youtube_dl/extractor/limelight.py229
-rw-r--r--youtube_dl/extractor/mtv.py15
-rw-r--r--youtube_dl/extractor/naver.py11
-rw-r--r--youtube_dl/extractor/nfl.py16
-rw-r--r--youtube_dl/extractor/nrk.py30
-rw-r--r--youtube_dl/extractor/pbs.py20
-rw-r--r--youtube_dl/extractor/qqmusic.py37
-rw-r--r--youtube_dl/extractor/ruutu.py2
-rw-r--r--youtube_dl/extractor/tapely.py6
-rw-r--r--youtube_dl/extractor/videolecturesnet.py104
-rw-r--r--youtube_dl/extractor/vk.py5
24 files changed, 663 insertions, 222 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7272859db..3ace1cc2c 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -158,6 +158,7 @@ from .eroprofile import EroProfileIE
from .escapist import EscapistIE
from .espn import ESPNIE
from .esri import EsriVideoIE
+from .europa import EuropaIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
@@ -169,10 +170,7 @@ from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
-from .fktv import (
- FKTVIE,
- FKTVPosteckeIE,
-)
+from .fktv import FKTVIE
from .flickr import FlickrIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
@@ -297,6 +295,11 @@ from .lifenews import (
LifeNewsIE,
LifeEmbedIE,
)
+from .limelight import (
+ LimelightMediaIE,
+ LimelightChannelIE,
+ LimelightChannelListIE,
+)
from .liveleak import LiveLeakIE
from .livestream import (
LivestreamIE,
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 4327c2f61..27de07587 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
ExtractorError,
float_or_none,
xpath_text,
@@ -123,7 +124,6 @@ class AdultSwimIE(InfoExtractor):
else:
collections = bootstrapped_data['show']['collections']
collection, video_info = self.find_collection_containing_video(collections, episode_path)
-
# Video wasn't found in the collections, let's try `slugged_video`.
if video_info is None:
if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
@@ -133,7 +133,9 @@ class AdultSwimIE(InfoExtractor):
show = bootstrapped_data['show']
show_title = show['title']
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+ stream = video_info.get('stream')
+ clips = [stream] if stream else video_info['clips']
+ segment_ids = [clip['videoPlaybackID'] for clip in clips]
episode_id = video_info['id']
episode_title = video_info['title']
@@ -142,7 +144,7 @@ class AdultSwimIE(InfoExtractor):
entries = []
for part_num, segment_id in enumerate(segment_ids):
- segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id
+ segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id
segment_title = '%s - %s' % (show_title, episode_title)
if len(segment_ids) > 1:
@@ -158,17 +160,30 @@ class AdultSwimIE(InfoExtractor):
formats = []
file_els = idoc.findall('.//files/file') or idoc.findall('./files/file')
+ unique_urls = []
+ unique_file_els = []
for file_el in file_els:
+ media_url = file_el.text
+ if not media_url or determine_ext(media_url) == 'f4m':
+ continue
+ if file_el.text not in unique_urls:
+ unique_urls.append(file_el.text)
+ unique_file_els.append(file_el)
+
+ for file_el in unique_file_els:
bitrate = file_el.attrib.get('bitrate')
ftype = file_el.attrib.get('type')
-
- formats.append({
- 'format_id': '%s_%s' % (bitrate, ftype),
- 'url': file_el.text.strip(),
- # The bitrate may not be a number (for example: 'iphone')
- 'tbr': int(bitrate) if bitrate.isdigit() else None,
- 'quality': 1 if ftype == 'hd' else -1
- })
+ media_url = file_el.text
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, segment_title, 'mp4', 'm3u8_native', preference=0, m3u8_id='hls'))
+ else:
+ formats.append({
+ 'format_id': '%s_%s' % (bitrate, ftype),
+ 'url': file_el.text.strip(),
+ # The bitrate may not be a number (for example: 'iphone')
+ 'tbr': int(bitrate) if bitrate.isdigit() else None,
+ })
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 42526357a..cc2f6fed2 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -21,6 +21,9 @@ class BBCCoUkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
_MEDIASELECTOR_URLS = [
+ # Provides HQ HLS streams with even better quality that pc mediaset but fails
+ # with geolocation in some cases when it's even not geo restricted at all (e.g.
+ # http://www.bbc.co.uk/programmes/b06bp7lf)
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
]
@@ -154,6 +157,21 @@ class BBCCoUkIE(InfoExtractor):
},
'skip': 'geolocation',
}, {
+ # iptv-all mediaset fails with geolocation however there is no geo restriction
+ # for this programme at all
+ 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf',
+ 'info_dict': {
+ 'id': 'b06bp7kf',
+ 'ext': 'flv',
+ 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie",
+ 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.',
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
}, {
@@ -294,7 +312,7 @@ class BBCCoUkIE(InfoExtractor):
return self._download_media_selector_url(
mediaselector_url % programme_id, programme_id)
except BBCCoUkIE.MediaSelectionError as e:
- if e.id == 'notukerror':
+ if e.id in ('notukerror', 'geolocation'):
last_exception = e
continue
self._raise_extractor_error(e)
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 91ebb0ce5..3e4bd10b6 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
if mobj.group('shortname'):
- if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = 'http://thedailyshow.cc.com/full-episodes/'
- else:
- url = 'http://thecolbertreport.cc.com/full-episodes/'
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- assert mobj is not None
+ return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes')
if mobj.group('clip'):
if mobj.group('videotitle'):
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 1e7db8a9b..dbae75406 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -39,6 +39,7 @@ from ..utils import (
RegexNotFoundError,
sanitize_filename,
unescapeHTML,
+ unified_strdate,
url_basename,
xpath_text,
xpath_with_ns,
@@ -152,6 +153,7 @@ class InfoExtractor(object):
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
+ release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
@@ -163,6 +165,7 @@ class InfoExtractor(object):
with the "ext" entry and one of:
* "data": The subtitles file contents
* "url": A URL pointing to the subtitles file
+ "ext" will be calculated from URL if missing
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer.
@@ -868,13 +871,18 @@ class InfoExtractor(object):
time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
- transform_source=lambda s: fix_xml_ampersands(s).strip()):
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
- transform_source=transform_source)
+ transform_source=transform_source,
+ fatal=fatal)
+
+ if manifest is False:
+ return manifest
formats = []
manifest_version = '1.0'
@@ -895,7 +903,10 @@ class InfoExtractor(object):
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ f4m_formats = self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal)
+ if f4m_formats:
+ formats.extend(f4m_formats)
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -1043,6 +1054,7 @@ class InfoExtractor(object):
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
description = None
+ upload_date = None
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
name = meta.attrib.get('name')
content = meta.attrib.get('content')
@@ -1052,11 +1064,22 @@ class InfoExtractor(object):
title = content
elif not description and name in ('description', 'abstract'):
description = content
+ elif not upload_date and name == 'date':
+ upload_date = unified_strdate(content)
+
+ thumbnails = [{
+ 'id': image.get('type'),
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
return {
'id': video_id,
'title': title or video_id,
'description': description,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
}
@@ -1083,7 +1106,7 @@ class InfoExtractor(object):
if not src:
continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
@@ -1115,8 +1138,10 @@ class InfoExtractor(object):
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
if proto == 'm3u8' or src_ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+ m3u8_formats = self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
continue
if src_ext == 'f4m':
@@ -1128,10 +1153,12 @@ class InfoExtractor(object):
}
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse.urlencode(f4m_params)
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+ f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
continue
- if src_url.startswith('http'):
+ if src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index d6949ca28..6f92ae2ed 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -11,6 +11,7 @@ from ..compat import (
)
from ..utils import (
orderedSet,
+ remove_end,
)
@@ -44,12 +45,12 @@ class CondeNastIE(InfoExtractor):
'wmagazine': 'W Magazine',
}
- _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
- _TEST = {
+ _TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
'md5': '1921f713ed48aabd715691f774c451f7',
'info_dict': {
@@ -58,7 +59,16 @@ class CondeNastIE(InfoExtractor):
'title': '3D Printed Speakers Lit With LED',
'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
}
- }
+ }, {
+ # JS embed
+ 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
+ 'md5': 'f1a6f9cafb7083bab74a710f65d08999',
+ 'info_dict': {
+ 'id': '55f9cf8b61646d1acf00000c',
+ 'ext': 'mp4',
+ 'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
+ }
+ }]
def _extract_series(self, url, webpage):
title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
@@ -122,6 +132,13 @@ class CondeNastIE(InfoExtractor):
url_type = mobj.group('type')
item_id = mobj.group('id')
+ # Convert JS embed to regular embed
+ if url_type == 'embedjs':
+ parsed_url = compat_urlparse.urlparse(url)
+ url = compat_urlparse.urlunparse(parsed_url._replace(
+ path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
+ url_type = 'embed'
+
self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index a1ee51568..e529b9b96 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -21,7 +21,7 @@ class EaglePlatformIE(InfoExtractor):
_TESTS = [{
# http://lenta.ru/news/2015/03/06/navalny/
'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
- 'md5': '0b7994faa2bd5c0f69a3db6db28d078d',
+ 'md5': '70f5187fb620f2c1d503b3b22fd4efe3',
'info_dict': {
'id': '227304',
'ext': 'mp4',
@@ -36,7 +36,7 @@ class EaglePlatformIE(InfoExtractor):
# http://muz-tv.ru/play/7129/
# http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
'url': 'eagleplatform:media.clipyou.ru:12820',
- 'md5': '6c2ebeab03b739597ce8d86339d5a905',
+ 'md5': '90b26344ba442c8e44aa4cf8f301164a',
'info_dict': {
'id': '12820',
'ext': 'mp4',
@@ -48,7 +48,8 @@ class EaglePlatformIE(InfoExtractor):
'skip': 'Georestricted',
}]
- def _handle_error(self, response):
+ @staticmethod
+ def _handle_error(response):
status = int_or_none(response.get('status', 200))
if status != 200:
raise ExtractorError(' '.join(response['errors']), expected=True)
@@ -58,6 +59,9 @@ class EaglePlatformIE(InfoExtractor):
self._handle_error(response)
return response
+ def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
+ return self._download_json(url_or_request, video_id, note)['data'][0]
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
@@ -69,7 +73,7 @@ class EaglePlatformIE(InfoExtractor):
title = media['title']
description = media.get('description')
- thumbnail = media.get('snapshot')
+ thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')
duration = int_or_none(media.get('duration'))
view_count = int_or_none(media.get('views'))
@@ -78,13 +82,20 @@ class EaglePlatformIE(InfoExtractor):
if age_restriction:
age_limit = 0 if age_restriction == 'allow_all' else 18
- m3u8_data = self._download_json(
- self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:'),
- video_id, 'Downloading m3u8 JSON')
+ secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')
+ m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
formats = self._extract_m3u8_formats(
- m3u8_data['data'][0], video_id,
+ m3u8_url, video_id,
'mp4', entry_protocol='m3u8_native')
+
+ mp4_url = self._get_video_url(
+ # Secure mp4 URL is constructed according to Player.prototype.mp4 from
+ # http://lentaru.media.eagleplatform.com/player/player.js
+ re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8),
+ video_id, 'Downloading mp4 JSON')
+ formats.append({'url': mp4_url, 'format_id': 'mp4'})
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index 4ea37ebd9..e4180701d 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -10,7 +10,7 @@ from ..utils import (
class EngadgetIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video/5min/(?P<id>\d+)|
+ (?:video(?:/5min)?/(?P<id>\d+)|
[\d/]+/.*?)
'''
diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py
new file mode 100644
index 000000000..adc43919e
--- /dev/null
+++ b/youtube_dl/extractor/europa.py
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ orderedSet,
+ parse_duration,
+ qualities,
+ unified_strdate,
+ xpath_text
+)
+
+
+class EuropaIE(InfoExtractor):
+ _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
+ 'md5': '574f080699ddd1e19a675b0ddf010371',
+ 'info_dict': {
+ 'id': 'I107758',
+ 'ext': 'mp4',
+ 'title': 'TRADE - Wikileaks on TTIP',
+ 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20150811',
+ 'duration': 34,
+ 'view_count': int,
+ 'formats': 'mincount:3',
+ }
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ playlist = self._download_xml(
+ 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id)
+
+ def get_item(type_, preference):
+ items = {}
+ for item in playlist.findall('./info/%s/item' % type_):
+ lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+ if lang and label:
+ items[lang] = label.strip()
+ for p in preference:
+ if items.get(p):
+ return items[p]
+
+ query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ preferred_lang = query.get('sitelang', ('en', ))[0]
+
+ preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
+
+ title = get_item('title', preferred_langs) or video_id
+ description = get_item('description', preferred_langs)
+ thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail')
+ upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
+ duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
+ view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
+
+ language_preference = qualities(preferred_langs[::-1])
+
+ formats = []
+ for file_ in playlist.findall('./files/file'):
+ video_url = xpath_text(file_, './url')
+ if not video_url:
+ continue
+ lang = xpath_text(file_, './lg')
+ formats.append({
+ 'url': video_url,
+ 'format_id': lang,
+ 'format_note': xpath_text(file_, './lglabel'),
+ 'language_preference': language_preference(lang)
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnmail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 190d9f9ad..40ea27895 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -1,13 +1,12 @@
from __future__ import unicode_literals
import re
-import random
-import json
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
clean_html,
+ determine_ext,
+ ExtractorError,
)
@@ -17,66 +16,40 @@ class FKTVIE(InfoExtractor):
_TEST = {
'url': 'http://fernsehkritik.tv/folge-1',
+ 'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79',
'info_dict': {
- 'id': '00011',
- 'ext': 'flv',
+ 'id': '1',
+ 'ext': 'mp4',
'title': 'Folge 1 vom 10. April 2007',
- 'description': 'md5:fb4818139c7cfe6907d4b83412a6864f',
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
}
def _real_extract(self, url):
- episode = int(self._match_id(url))
-
- video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode
- start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,
- episode)
- playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,
- 'playlist', flags=re.DOTALL)
- files = json.loads(re.sub('{[^{}]*?}', '{}', playlist))
-
- videos = []
- for i, _ in enumerate(files, 1):
- video_id = '%04d%d' % (episode, i)
- video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)
- videos.append({
- 'ext': 'flv',
- 'id': video_id,
- 'url': video_url,
- 'title': clean_html(get_element_by_id('eptitle', start_webpage)),
- 'description': clean_html(get_element_by_id('contentlist', start_webpage)),
- 'thumbnail': video_thumbnail
- })
- return {
- '_type': 'multi_video',
- 'entries': videos,
- 'id': 'folge-%s' % episode,
- }
-
-
-class FKTVPosteckeIE(InfoExtractor):
- IE_NAME = 'fernsehkritik.tv:postecke'
- _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
- _TEST = {
- 'url': 'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
- 'md5': '262f0adbac80317412f7e57b4808e5c4',
- 'info_dict': {
- 'id': '0120',
- 'ext': 'flv',
- 'title': 'Postecke 120',
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = int(mobj.group('ep'))
-
- server = random.randint(2, 4)
- video_id = '%04d' % episode
- video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode)
- video_title = 'Postecke %d' % episode
+ episode = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://fernsehkritik.tv/folge-%s/play' % episode, episode)
+ title = clean_html(self._html_search_regex(
+ '<h3>([^<]+)</h3>', webpage, 'title'))
+ matches = re.search(
+ r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>',
+ webpage)
+ if matches is None:
+ raise ExtractorError('Unable to extract the video')
+
+ poster, sources = matches.groups()
+ if poster is None:
+ self.report_warning('unable to extract thumbnail')
+
+ urls = re.findall(r'<source[^>]+src="([^"]+)"', sources)
+ formats = [{
+ 'url': furl,
+ 'format_id': determine_ext(furl),
+ } for furl in urls]
return {
- 'id': video_id,
- 'url': video_url,
- 'title': video_title,
+ 'id': episode,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': poster,
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 8881a8a23..ca5fbafb2 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -50,6 +50,7 @@ from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
from .snagfilms import SnagFilmsEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
+from .mtv import MTVServicesEmbeddedIE
class GenericIE(InfoExtractor):
@@ -1611,12 +1612,9 @@ class GenericIE(InfoExtractor):
return self.url_result(url, ie='Vulture')
# Look for embedded mtvservices player
- mobj = re.search(
- r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
- webpage)
- if mobj is not None:
- url = unescapeHTML(mobj.group('url'))
- return self.url_result(url, ie='MTVServicesEmbedded')
+ mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+ if mtvservices_url:
+ return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
# Look for embedded yahoo player
mobj = re.search(
@@ -1655,7 +1653,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'MLB')
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+ r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
webpage)
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index c0956ba09..94a03d277 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -1,46 +1,39 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<id>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P<id>\w+)'
IE_NAME = 'keek'
_TEST = {
- 'url': 'https://www.keek.com/ytdl/keeks/NODfbab',
- 'md5': '09c5c109067536c1cec8bac8c21fea05',
+ 'url': 'https://www.keek.com/keek/NODfbab',
+ 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
'info_dict': {
'id': 'NODfbab',
'ext': 'mp4',
- 'uploader': 'youtube-dl project',
- 'uploader_id': 'ytdl',
- 'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .',
+ 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896',
+ 'uploader': 'ytdl',
+ 'uploader_id': 'eGT5bab',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
- video_url = 'http://cdn.keek.com/keek/video/%s' % video_id
- thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- raw_desc = self._html_search_meta('description', webpage)
- if raw_desc:
- uploader = self._html_search_regex(
- r'Watch (.*?)\s+\(', raw_desc, 'uploader', fatal=False)
- uploader_id = self._html_search_regex(
- r'Watch .*?\(@(.+?)\)', raw_desc, 'uploader_id', fatal=False)
- else:
- uploader = None
- uploader_id = None
-
return {
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': 'mp4',
- 'title': self._og_search_title(webpage),
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
+ 'title': self._og_search_description(webpage).strip(),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': self._search_regex(
+ r'data-username=(["\'])(?P<uploader>.+?)\1', webpage,
+ 'uploader', fatal=False, group='uploader'),
+ 'uploader_id': self._search_regex(
+ r'data-user-id=(["\'])(?P<uploader_id>.+?)\1', webpage,
+ 'uploader id', fatal=False, group='uploader_id'),
}
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index fa233377d..0c8ed5d07 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -57,6 +57,7 @@ class KuwoIE(KuwoBaseIE):
'upload_date': '20080122',
'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
},
+ 'skip': 'this song has been offline because of copyright issues',
}, {
'url': 'http://www.kuwo.cn/yinyue/6446136/',
'info_dict': {
@@ -76,9 +77,11 @@ class KuwoIE(KuwoBaseIE):
webpage = self._download_webpage(
url, song_id, note='Download song detail info',
errnote='Unable to get song detail info')
+ if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
+ raise ExtractorError('this song has been offline because of copyright issues', expected=True)
song_name = self._html_search_regex(
- r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
+ r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name')
singer_name = self._html_search_regex(
r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
webpage, 'singer name', fatal=False)
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
new file mode 100644
index 000000000..fb03dd527
--- /dev/null
+++ b/youtube_dl/extractor/limelight.py
@@ -0,0 +1,229 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+)
+
+
+class LimelightBaseIE(InfoExtractor):
+ _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
+ _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+
+ def _call_playlist_service(self, item_id, method, fatal=True):
+ return self._download_json(
+ self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
+ item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal)
+
+ def _call_api(self, organization_id, item_id, method):
+ return self._download_json(
+ self._API_URL % (organization_id, self._API_PATH, item_id, method),
+ item_id, 'Downloading API %s JSON' % method)
+
+ def _extract(self, item_id, pc_method, mobile_method, meta_method):
+ pc = self._call_playlist_service(item_id, pc_method)
+ metadata = self._call_api(pc['orgId'], item_id, meta_method)
+ mobile = self._call_playlist_service(item_id, mobile_method, fatal=False)
+ return pc, mobile, metadata
+
+ def _extract_info(self, streams, mobile_urls, properties):
+ video_id = properties['media_id']
+ formats = []
+
+ for stream in streams:
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ if '.f4m' in stream_url:
+ formats.extend(self._extract_f4m_formats(stream_url, video_id))
+ else:
+ fmt = {
+ 'url': stream_url,
+ 'abr': float_or_none(stream.get('audioBitRate')),
+ 'vbr': float_or_none(stream.get('videoBitRate')),
+ 'fps': float_or_none(stream.get('videoFrameRate')),
+ 'width': int_or_none(stream.get('videoWidthInPixels')),
+ 'height': int_or_none(stream.get('videoHeightInPixels')),
+ 'ext': determine_ext(stream_url)
+ }
+ rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+ if rtmp:
+ format_id = 'rtmp'
+ if stream.get('videoBitRate'):
+ format_id += '-%d' % int_or_none(stream['videoBitRate'])
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': format_id,
+ })
+ formats.append(fmt)
+
+ for mobile_url in mobile_urls:
+ media_url = mobile_url.get('mobileUrl')
+ if not media_url:
+ continue
+ format_id = mobile_url.get('targetMediaPlatform')
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=-1, m3u8_id=format_id))
+ else:
+ formats.append({
+ 'url': media_url,
+ 'format_id': format_id,
+ 'preference': -1,
+ })
+
+ self._sort_formats(formats)
+
+ title = properties['title']
+ description = properties.get('description')
+ timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date'))
+ duration = float_or_none(properties.get('duration_in_milliseconds'), 1000)
+ filesize = int_or_none(properties.get('total_storage_in_bytes'))
+ categories = [properties.get('category')]
+ tags = properties.get('tags', [])
+ thumbnails = [{
+ 'url': thumbnail['url'],
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
+
+ subtitles = {}
+ for caption in properties.get('captions', {}):
+ lang = caption.get('language_code')
+ subtitles_url = caption.get('url')
+ if lang and subtitles_url:
+ subtitles[lang] = [{
+ 'url': subtitles_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'filesize': filesize,
+ 'categories': categories,
+ 'tags': tags,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
+ }
+
+
+class LimelightMediaIE(LimelightBaseIE):
+ IE_NAME = 'limelight'
+ _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
+ _TESTS = [{
+ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
+ 'info_dict': {
+ 'id': '3ffd040b522b4485b6d84effc750cd86',
+ 'ext': 'flv',
+ 'title': 'HaP and the HB Prince Trailer',
+ 'description': 'md5:8005b944181778e313d95c1237ddb640',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 144.23,
+ 'timestamp': 1244136834,
+ 'upload_date': '20090604',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # video with subtitles
+ 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+ 'info_dict': {
+ 'id': 'a3e00274d4564ec4a9b29b9466432335',
+ 'ext': 'flv',
+ 'title': '3Play Media Overview Video',
+ 'description': '',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 78.101,
+ 'timestamp': 1338929955,
+ 'upload_date': '20120605',
+ 'subtitles': 'mincount:9',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+ _PLAYLIST_SERVICE_PATH = 'media'
+ _API_PATH = 'media'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ pc, mobile, metadata = self._extract(
+ video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties')
+
+ return self._extract_info(
+ pc['playlistItems'][0].get('streams', []),
+ mobile['mediaList'][0].get('mobileUrls', []) if mobile else [],
+ metadata)
+
+
+class LimelightChannelIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel'
+ _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
+ 'info_dict': {
+ 'id': 'ab6a524c379342f9b23642917020c082',
+ 'title': 'Javascript Sample Code',
+ },
+ 'playlist_mincount': 3,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel'
+ _API_PATH = 'channels'
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ pc, mobile, medias = self._extract(
+ channel_id, 'getPlaylistByChannelId',
+ 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media')
+
+ entries = [
+ self._extract_info(
+ pc['playlistItems'][i].get('streams', []),
+ mobile['mediaList'][i].get('mobileUrls', []) if mobile else [],
+ medias['media_list'][i])
+ for i in range(len(medias['media_list']))]
+
+ return self.playlist_result(entries, channel_id, pc['title'])
+
+
+class LimelightChannelListIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel_list'
+ _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
+ 'info_dict': {
+ 'id': '301b117890c4465c8179ede21fd92e2b',
+ 'title': 'Website - Hero Player',
+ },
+ 'playlist_mincount': 2,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel_list'
+
+ def _real_extract(self, url):
+ channel_list_id = self._match_id(url)
+
+ channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById')
+
+ entries = [
+ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
+ for channel in channel_list['channelList']]
+
+ return self.playlist_result(entries, channel_list_id, channel_list['title'])
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index a597714e9..302c9bf35 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -200,7 +200,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
if mgid is None or ':' not in mgid:
mgid = self._search_regex(
[r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
- webpage, 'mgid')
+ webpage, 'mgid', default=None)
+
+ if not mgid:
+ sm4_embed = self._html_search_meta(
+ 'sm4:video:embed', webpage, 'sm4 embed', default='')
+ mgid = self._search_regex(
+ r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid')
videos_info = self._get_videos_info(mgid)
return videos_info
@@ -222,6 +228,13 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
},
}
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
def _get_feed_url(self, uri):
video_id = self._id_from_uri(uri)
site_id = uri.replace(video_id, '')
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 925967753..1f5fc2145 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -10,7 +10,6 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
- clean_html,
)
@@ -46,11 +45,11 @@ class NaverIE(InfoExtractor):
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
webpage)
if m_id is None:
- m_error = re.search(
- r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
- webpage)
- if m_error:
- raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
+ error = self._html_search_regex(
+ r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 55dc6107d..200874d68 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -108,6 +108,20 @@ class NFLIE(InfoExtractor):
'upload_date': '20150918',
},
}, {
+ # lowercase data-contentid
+ 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
+ 'info_dict': {
+ 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
+ 'ext': 'mp4',
+ 'title': 'Tomlin looks ahead to Ravens on a short week',
+ 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
+ 'timestamp': 1443459651,
+ 'upload_date': '20150928',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
'only_matching': True,
}, {
@@ -151,7 +165,7 @@ class NFLIE(InfoExtractor):
group='config'))
# For articles, the id in the url is not the video id
video_id = self._search_regex(
- r'(?:<nflcs:avplayer[^>]+data-contentId\s*=\s*|contentId\s*:\s*)(["\'])(?P<id>.+?)\1',
+ r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>.+?)\1',
webpage, 'video id', default=video_id, group='id')
config = self._download_json(config_url, video_id, 'Downloading player config')
url_template = NFLIE.prepend_host(
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index d066a96db..8ac38a174 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
float_or_none,
@@ -49,7 +50,7 @@ class NRKIE(InfoExtractor):
if data['usageRights']['isGeoBlocked']:
raise ExtractorError(
- 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge',
+ 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
expected=True)
video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
@@ -196,20 +197,6 @@ class NRKTVIE(InfoExtractor):
}
]
- def _debug_print(self, txt):
- if self._downloader.params.get('verbose', False):
- self.to_screen('[debug] %s' % txt)
-
- def _get_subtitles(self, subtitlesurl, video_id, baseurl):
- url = "%s%s" % (baseurl, subtitlesurl)
- self._debug_print('%s: Subtitle url: %s' % (video_id, url))
- captions = self._download_xml(
- url, video_id, 'Downloading subtitles')
- lang = captions.get('lang', 'no')
- return {lang: [
- {'ext': 'ttml', 'url': url},
- ]}
-
def _extract_f4m(self, manifest_url, video_id):
return self._extract_f4m_formats(
manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
@@ -218,7 +205,7 @@ class NRKTVIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
part_id = mobj.group('part_id')
- baseurl = mobj.group('baseurl')
+ base_url = mobj.group('baseurl')
webpage = self._download_webpage(url, video_id)
@@ -278,11 +265,14 @@ class NRKTVIE(InfoExtractor):
self._sort_formats(formats)
subtitles_url = self._html_search_regex(
- r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"',
- webpage, 'subtitle URL', default=None)
- subtitles = None
+ r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
+ webpage, 'subtitle URL', default=None, group='url')
+ subtitles = {}
if subtitles_url:
- subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl)
+ subtitles['no'] = [{
+ 'ext': 'ttml',
+ 'url': compat_urlparse.urljoin(base_url, subtitles_url),
+ }]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 683c81de3..6923c6094 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -134,6 +134,24 @@ class PBSIE(InfoExtractor):
'params': {
'skip_download': True, # requires ffmpeg
},
+ },
+ {
+ # Video embedded in iframe containing angle brackets as attribute's value (e.g.
+ # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
+ # https://github.com/rg3/youtube-dl/issues/7059)
+ 'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+ 'info_dict': {
+ 'id': '2365546844',
+ 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
+ 'ext': 'mp4',
+ 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
+ 'description': 'md5:61db2ddf27c9912f09c241014b118ed1',
+ 'duration': 1480,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}
]
@@ -167,7 +185,7 @@ class PBSIE(InfoExtractor):
return media_id, presumptive_id, upload_date
url = self._search_regex(
- r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
+ r'(?s)<iframe[^>]+?(?:[a-z-]+?=["\'].*?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
webpage, 'player URL')
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 1654a641f..c98539f6a 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -25,7 +25,7 @@ class QQMusicIE(InfoExtractor):
'id': '004295Et37taLD',
'ext': 'mp3',
'title': '可惜没如果',
- 'upload_date': '20141227',
+ 'release_date': '20141227',
'creator': '林俊杰',
'description': 'md5:d327722d0361576fde558f1ac68a7065',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -38,11 +38,26 @@ class QQMusicIE(InfoExtractor):
'id': '004MsGEo3DdNxV',
'ext': 'mp3',
'title': '如果',
- 'upload_date': '20050626',
+ 'release_date': '20050626',
'creator': '李季美',
'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
'thumbnail': 're:^https?://.*\.jpg$',
}
+ }, {
+ 'note': 'lyrics not in .lrc format',
+ 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+ 'info_dict': {
+ 'id': '001JyApY11tIp6',
+ 'ext': 'mp3',
+ 'title': 'Shadows Over Transylvania',
+ 'release_date': '19970225',
+ 'creator': 'Dark Funeral',
+ 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
_FORMATS = {
@@ -112,15 +127,27 @@ class QQMusicIE(InfoExtractor):
self._check_formats(formats, mid)
self._sort_formats(formats)
- return {
+ actual_lrc_lyrics = ''.join(
+ line + '\n' for line in re.findall(
+ r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content))
+
+ info_dict = {
'id': mid,
'formats': formats,
'title': song_name,
- 'upload_date': publish_time,
+ 'release_date': publish_time,
'creator': singer,
'description': lrc_content,
- 'thumbnail': thumbnail_url,
+ 'thumbnail': thumbnail_url
}
+ if actual_lrc_lyrics:
+ info_dict['subtitles'] = {
+ 'origin': [{
+ 'ext': 'lrc',
+ 'data': actual_lrc_lyrics,
+ }]
+ }
+ return info_dict
class QQPlaylistBaseIE(InfoExtractor):
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
index c67ad25ce..a16b73ff4 100644
--- a/youtube_dl/extractor/ruutu.py
+++ b/youtube_dl/extractor/ruutu.py
@@ -74,7 +74,7 @@ class RuutuIE(InfoExtractor):
preference = -1 if proto == 'rtmp' else 1
label = child.get('label')
tbr = int_or_none(child.get('bitrate'))
- width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')]
+ width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
formats.append({
'format_id': '%s-%s' % (proto, label if label else tbr),
'url': video_url,
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
index f1f43d0a7..744f9db38 100644
--- a/youtube_dl/extractor/tapely.py
+++ b/youtube_dl/extractor/tapely.py
@@ -16,7 +16,7 @@ from ..utils import (
class TapelyIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
_API_URL = 'http://tape.ly/showtape?id={0:}'
_S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
_SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
@@ -42,6 +42,10 @@ class TapelyIE(InfoExtractor):
'ext': 'm4a',
},
},
+ {
+ 'url': 'https://tapely.com/my-grief-as-told-by-water',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
index ef2da5632..649ac9433 100644
--- a/youtube_dl/extractor/videolecturesnet.py
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urlparse,
+)
from ..utils import (
- find_xpath_attr,
- int_or_none,
+ ExtractorError,
parse_duration,
- unified_strdate,
)
@@ -15,7 +17,7 @@ class VideoLecturesNetIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$'
IE_NAME = 'videolectures.net'
- _TEST = {
+ _TESTS = [{
'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
'info_dict': {
'id': 'promogram_igor_mekjavic_eng',
@@ -26,61 +28,55 @@ class VideoLecturesNetIE(InfoExtractor):
'duration': 565,
'thumbnail': 're:http://.*\.jpg',
},
- }
+ }, {
+ # video with invalid direct format links (HTTP 403)
+ 'url': 'http://videolectures.net/russir2010_filippova_nlp/',
+ 'info_dict': {
+ 'id': 'russir2010_filippova_nlp',
+ 'ext': 'flv',
+ 'title': 'NLP at Google',
+ 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
+ 'duration': 5352,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://videolectures.net/deeplearning2015_montreal/',
+ 'info_dict': {
+ 'id': 'deeplearning2015_montreal',
+ 'title': 'Deep Learning Summer School, Montreal 2015',
+ 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b',
+ },
+ 'playlist_count': 30,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
- smil = self._download_xml(smil_url, video_id)
- title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
- description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
- description = (
- None if description_el is None
- else description_el.attrib['content'])
- upload_date = unified_strdate(
- find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
+ try:
+ smil = self._download_smil(smil_url, video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ # Probably a playlist
+ webpage = self._download_webpage(url, video_id)
+ entries = [
+ self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet')
+ for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)]
+ playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True)
+ playlist_description = self._html_search_meta('description', webpage, 'description')
+ return self.playlist_result(entries, video_id, playlist_title, playlist_description)
- switch = smil.find('.//switch')
- duration = parse_duration(switch.attrib.get('dur'))
- thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
- thumbnail = (
- None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
+ info = self._parse_smil(smil, smil_url, video_id)
- formats = []
- for v in switch.findall('./video'):
- proto = v.attrib.get('proto')
- if proto not in ['http', 'rtmp']:
- continue
- f = {
- 'width': int_or_none(v.attrib.get('width')),
- 'height': int_or_none(v.attrib.get('height')),
- 'filesize': int_or_none(v.attrib.get('size')),
- 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
- 'ext': v.attrib.get('ext'),
- }
- src = v.attrib['src']
- if proto == 'http':
- if self._is_valid_url(src, video_id):
- f['url'] = src
- formats.append(f)
- elif proto == 'rtmp':
- f.update({
- 'url': v.attrib['streamer'],
- 'play_path': src,
- 'rtmp_real_time': True,
- })
- formats.append(f)
- self._sort_formats(formats)
+ info['id'] = video_id
+
+ switch = smil.find('.//switch')
+ if switch is not None:
+ info['duration'] = parse_duration(switch.attrib.get('dur'))
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'upload_date': upload_date,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
+ return info
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index c30c5a8e5..765e9e6fd 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -17,6 +17,7 @@ from ..utils import (
unescapeHTML,
unified_strdate,
)
+from .vimeo import VimeoIE
class VKIE(InfoExtractor):
@@ -249,6 +250,10 @@ class VKIE(InfoExtractor):
if youtube_url:
return self.url_result(youtube_url, 'Youtube')
+ vimeo_url = VimeoIE._extract_vimeo_url(url, info_page)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
+
m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
if m_rutube is not None: