aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/theplatform.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/theplatform.py')
-rw-r--r--youtube_dl/extractor/theplatform.py131
1 files changed, 95 insertions, 36 deletions
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 863914299..cfbf7f4e1 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -9,16 +9,19 @@ import hashlib
from .once import OnceIE
+from .adobepass import AdobePassIE
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
+ determine_ext,
ExtractorError,
float_or_none,
int_or_none,
sanitized_Request,
unsmuggle_url,
+ update_url_query,
xpath_with_ns,
mimetype2ext,
find_xpath_attr,
@@ -48,27 +51,32 @@ class ThePlatformBaseIE(OnceIE):
if OnceIE.suitable(_format['url']):
formats.extend(self._extract_once_formats(_format['url']))
else:
- formats.append(_format)
+ media_url = _format['url']
+ if determine_ext(media_url) == 'm3u8':
+ hdnea2 = self._get_cookies(media_url).get('hdnea2')
+ if hdnea2:
+ _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
- self._sort_formats(formats)
+ formats.append(_format)
subtitles = self._parse_smil_subtitles(meta, default_ns)
return formats, subtitles
- def get_metadata(self, path, video_id):
+ def _download_theplatform_metadata(self, path, video_id):
info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
- info = self._download_json(info_url, video_id)
+ return self._download_json(info_url, video_id)
+ def _parse_theplatform_metadata(self, info):
subtitles = {}
captions = info.get('captions')
if isinstance(captions, list):
for caption in captions:
lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
- subtitles[lang] = [{
+ subtitles.setdefault(lang, []).append({
'ext': mimetype2ext(mime),
'url': src,
- }]
+ })
return {
'title': info['title'],
@@ -76,13 +84,19 @@ class ThePlatformBaseIE(OnceIE):
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
'duration': int_or_none(info.get('duration'), 1000),
+ 'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
+ 'uploader': info.get('billingCode'),
}
+ def _extract_theplatform_metadata(self, path, video_id):
+ info = self._download_theplatform_metadata(path, video_id)
+ return self._parse_theplatform_metadata(info)
-class ThePlatformIE(ThePlatformBaseIE):
+
+class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
- (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
_TESTS = [{
@@ -94,11 +108,15 @@ class ThePlatformIE(ThePlatformBaseIE):
'title': 'Blackberry\'s big, bold Z30',
'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
'duration': 247,
+ 'timestamp': 1383239700,
+ 'upload_date': '20131031',
+ 'uploader': 'CBSI-NEW',
},
'params': {
# rtmp download
'skip_download': True,
},
+ 'skip': '404 Not Found',
}, {
# from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
@@ -107,6 +125,9 @@ class ThePlatformIE(ThePlatformBaseIE):
'ext': 'flv',
'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
'title': 'Tesla Model S: A second step towards a cleaner motoring future',
+ 'timestamp': 1426176191,
+ 'upload_date': '20150312',
+ 'uploader': 'CBSI-NEW',
},
'params': {
# rtmp download
@@ -119,6 +140,7 @@ class ThePlatformIE(ThePlatformBaseIE):
'ext': 'mp4',
'description': 'md5:644ad9188d655b742f942bf2e06b002d',
'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+ 'uploader': 'EGSM',
}
}, {
'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
@@ -135,6 +157,7 @@ class ThePlatformIE(ThePlatformBaseIE):
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1435752600,
'upload_date': '20150701',
+ 'uploader': 'NBCU-NEWS',
},
}, {
# From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
@@ -143,6 +166,22 @@ class ThePlatformIE(ThePlatformBaseIE):
'only_matching': True,
}]
+ @classmethod
+ def _extract_urls(cls, webpage):
+ m = re.search(
+ r'''(?x)
+ <meta\s+
+ property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+ content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+ ''', webpage)
+ if m:
+ return [m.group('url')]
+
+ matches = re.findall(
+ r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+ if matches:
+ return list(zip(*matches))[1]
+
@staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
flags = '10' if include_qs else '00'
@@ -151,11 +190,11 @@ class ThePlatformIE(ThePlatformBaseIE):
def str_to_hex(str):
return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
- def hex_to_str(hex):
- return binascii.a2b_hex(hex)
+ def hex_to_bytes(hex):
+ return binascii.a2b_hex(hex.encode('ascii'))
- relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0]
- clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
+ relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1)
+ clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
return '%s&sig=%s' % (url, sig)
@@ -170,10 +209,10 @@ class ThePlatformIE(ThePlatformBaseIE):
if not provider_id:
provider_id = 'dJ5BDC'
- path = provider_id
+ path = provider_id + '/'
if mobj.group('media'):
- path += '/media'
- path += '/' + video_id
+ path += mobj.group('media')
+ path += video_id
qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
if 'guid' in qs_dict:
@@ -231,8 +270,9 @@ class ThePlatformIE(ThePlatformBaseIE):
smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
+ self._sort_formats(formats)
- ret = self.get_metadata(path, video_id)
+ ret = self._extract_theplatform_metadata(path, video_id)
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
ret.update({
'id': video_id,
@@ -244,9 +284,9 @@ class ThePlatformIE(ThePlatformBaseIE):
class ThePlatformFeedIE(ThePlatformBaseIE):
- _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
- _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
- _TEST = {
+ _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
+ _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+ _TESTS = [{
# From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
'md5': '6e32495b5073ab414471b615c5ded394',
@@ -260,33 +300,40 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
'timestamp': 1391824260,
'duration': 467.0,
'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+ 'uploader': 'NBCU-NEWS',
},
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
- provider_id = mobj.group('provider_id')
- feed_id = mobj.group('feed_id')
+ }]
- real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
- feed = self._download_json(real_url, video_id)
- entry = feed['entries'][0]
+ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+ real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
+ entry = self._download_json(real_url, video_id)['entries'][0]
formats = []
subtitles = {}
first_video_id = None
duration = None
+ asset_types = []
for item in entry['media$content']:
- smil_url = item['plfile$url'] + '&mbr=true'
+ smil_url = item['plfile$url']
cur_video_id = ThePlatformIE._match_id(smil_url)
if first_video_id is None:
first_video_id = cur_video_id
duration = float_or_none(item.get('plfile$duration'))
- cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
- formats.extend(cur_formats)
- subtitles = self._merge_subtitles(subtitles, cur_subtitles)
+ for asset_type in item['plfile$assetTypes']:
+ if asset_type in asset_types:
+ continue
+ asset_types.append(asset_type)
+ query = {
+ 'mbr': 'true',
+ 'formats': item['plfile$format'],
+ 'assetTypes': asset_type,
+ }
+ if asset_type in asset_types_query:
+ query.update(asset_types_query[asset_type])
+ cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
+ smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+ formats.extend(cur_formats)
+ subtitles = self._merge_subtitles(subtitles, cur_subtitles)
self._sort_formats(formats)
@@ -299,7 +346,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
categories = [item['media$name'] for item in entry.get('media$categories', [])]
- ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+ ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
ret.update({
'id': video_id,
@@ -310,5 +357,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
'timestamp': timestamp,
'categories': categories,
})
+ if custom_fields:
+ ret.update(custom_fields(entry))
return ret
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ provider_id = mobj.group('provider_id')
+ feed_id = mobj.group('feed_id')
+ filter_query = mobj.group('filter')
+
+ return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)