author     Tithen-Firion <Tithen-Firion@users.noreply.github.com>  2017-05-04 11:00:06 +0200
committer  GitHub <noreply@github.com>  2017-05-04 11:00:06 +0200
commit     c89267d31ad99eb5b1a87cd354de5280a2a087b1 (patch)
tree       8bb3b01cd088d0646089344bddd3d4ff272c0065 /youtube_dl
parent     7552f96352f35cd877e52fd0770b77ba1856fc62 (diff)
parent     0c265486016b06342fb257966474ce591667aaff (diff)

Merge branch 'master' into openload-phantomjs-method
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x  youtube_dl/YoutubeDL.py | 14
-rw-r--r--  youtube_dl/__init__.py | 1
-rw-r--r--  youtube_dl/downloader/common.py | 34
-rw-r--r--  youtube_dl/downloader/dash.py | 43
-rw-r--r--  youtube_dl/downloader/external.py | 12
-rw-r--r--  youtube_dl/downloader/f4m.py | 33
-rw-r--r--  youtube_dl/downloader/fragment.py | 122
-rw-r--r--  youtube_dl/downloader/hls.py | 34
-rw-r--r--  youtube_dl/downloader/ism.py | 34
-rw-r--r--  youtube_dl/extractor/adobepass.py | 6
-rw-r--r--  youtube_dl/extractor/aenetworks.py | 14
-rw-r--r--  youtube_dl/extractor/afreecatv.py | 3
-rw-r--r--  youtube_dl/extractor/amp.py | 26
-rw-r--r--  youtube_dl/extractor/anvato.py | 66
-rw-r--r--  youtube_dl/extractor/appleconnect.py | 4
-rw-r--r--  youtube_dl/extractor/appletrailers.py | 5
-rw-r--r--  youtube_dl/extractor/archiveorg.py | 4
-rw-r--r--  youtube_dl/extractor/arte.py | 5
-rw-r--r--  youtube_dl/extractor/atresplayer.py | 2
-rw-r--r--  youtube_dl/extractor/audioboom.py | 2
-rw-r--r--  youtube_dl/extractor/azubu.py | 140
-rw-r--r--  youtube_dl/extractor/bandcamp.py | 8
-rw-r--r--  youtube_dl/extractor/beeg.py | 2
-rw-r--r--  youtube_dl/extractor/bleacherreport.py | 10
-rw-r--r--  youtube_dl/extractor/br.py | 2
-rw-r--r--  youtube_dl/extractor/brightcove.py | 26
-rw-r--r--  youtube_dl/extractor/canalc2.py | 5
-rw-r--r--  youtube_dl/extractor/cbc.py | 6
-rw-r--r--  youtube_dl/extractor/cbslocal.py | 4
-rwxr-xr-x  youtube_dl/extractor/cda.py | 52
-rw-r--r--  youtube_dl/extractor/clipfish.py | 2
-rw-r--r--  youtube_dl/extractor/collegerama.py | 3
-rw-r--r--  youtube_dl/extractor/common.py | 213
-rw-r--r--  youtube_dl/extractor/coub.py | 5
-rw-r--r--  youtube_dl/extractor/crunchyroll.py | 4
-rw-r--r--  youtube_dl/extractor/dailymotion.py | 21
-rw-r--r--  youtube_dl/extractor/democracynow.py | 3
-rw-r--r--  youtube_dl/extractor/dotsub.py | 2
-rw-r--r--  youtube_dl/extractor/douyutv.py | 4
-rw-r--r--  youtube_dl/extractor/extractors.py | 10
-rw-r--r--  youtube_dl/extractor/foxsports.py | 9
-rw-r--r--  youtube_dl/extractor/funnyordie.py | 3
-rw-r--r--  youtube_dl/extractor/gamespot.py | 3
-rw-r--r--  youtube_dl/extractor/generic.py | 86
-rw-r--r--  youtube_dl/extractor/go.py | 49
-rw-r--r--  youtube_dl/extractor/go90.py | 38
-rw-r--r--  youtube_dl/extractor/infoq.py | 4
-rw-r--r--  youtube_dl/extractor/instagram.py | 8
-rw-r--r--  youtube_dl/extractor/iqiyi.py | 26
-rw-r--r--  youtube_dl/extractor/itv.py | 28
-rw-r--r--  youtube_dl/extractor/leeco.py | 111
-rw-r--r--  youtube_dl/extractor/lego.py | 2
-rw-r--r--  youtube_dl/extractor/limelight.py | 37
-rw-r--r--  youtube_dl/extractor/noovo.py | 97
-rw-r--r--  youtube_dl/extractor/nowness.py | 2
-rw-r--r--  youtube_dl/extractor/odnoklassniki.py | 32
-rw-r--r--  youtube_dl/extractor/pbs.py | 19
-rw-r--r--  youtube_dl/extractor/porn91.py | 32
-rw-r--r--  youtube_dl/extractor/r7.py | 3
-rw-r--r--  youtube_dl/extractor/streamable.py | 6
-rw-r--r--  youtube_dl/extractor/streamango.py | 64
-rw-r--r--  youtube_dl/extractor/ted.py | 2
-rw-r--r--  youtube_dl/extractor/tvp.py | 3
-rw-r--r--  youtube_dl/extractor/tvplayer.py | 35
-rw-r--r--  youtube_dl/extractor/vevo.py | 17
-rw-r--r--  youtube_dl/extractor/videopress.py | 9
-rw-r--r--  youtube_dl/extractor/vidio.py | 7
-rw-r--r--  youtube_dl/extractor/vidzi.py | 11
-rw-r--r--  youtube_dl/extractor/viewster.py | 3
-rw-r--r--  youtube_dl/extractor/washingtonpost.py | 6
-rw-r--r--  youtube_dl/extractor/wsj.py | 52
-rw-r--r--  youtube_dl/extractor/xfileshare.py | 28
-rw-r--r--  youtube_dl/extractor/xtube.py | 23
-rw-r--r--  youtube_dl/extractor/xvideos.py | 11
-rw-r--r--  youtube_dl/extractor/yahoo.py | 2
-rw-r--r--  youtube_dl/extractor/yandexmusic.py | 3
-rw-r--r--  youtube_dl/extractor/youtube.py | 5
-rw-r--r--  youtube_dl/extractor/zaq1.py | 101
-rw-r--r--  youtube_dl/options.py | 4
-rw-r--r--  youtube_dl/postprocessor/ffmpeg.py | 31
-rw-r--r--  youtube_dl/socks.py | 5
-rw-r--r--  youtube_dl/utils.py | 210
-rw-r--r--  youtube_dl/version.py | 2
83 files changed, 1536 insertions, 679 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 7953670a7..eb465c425 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -640,7 +640,7 @@ class YoutubeDL(object):
NUMERIC_FIELDS = set((
'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
- 'upload_year', 'upload_month', 'upload_day',
+ 'timestamp', 'upload_year', 'upload_month', 'upload_day',
'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
'average_rating', 'comment_count', 'age_limit',
'start_time', 'end_time',
@@ -672,8 +672,7 @@ class YoutubeDL(object):
FORMAT_RE.format(numeric_field),
r'%({0})s'.format(numeric_field), outtmpl)
- tmpl = expand_path(outtmpl)
- filename = tmpl % template_dict
+ filename = expand_path(outtmpl % template_dict)
# Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows
@@ -851,7 +850,14 @@ class YoutubeDL(object):
new_result = info.copy()
new_result.update(force_properties)
- assert new_result.get('_type') != 'url_transparent'
+ # Extracted info may not be a video result (i.e.
+ # info.get('_type', 'video') != video) but rather an url or
+ # url_transparent. In such cases outer metadata (from ie_result)
+ # should be propagated to inner one (info). For this to happen
+ # _type of info should be overridden with url_transparent. This
+ # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
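
Note: the comment in this hunk carries the core reasoning: a url_transparent parent must push its metadata into the inner result the target extractor returns. A minimal standalone sketch of that merge rule, on plain dicts rather than the actual YoutubeDL machinery (force_properties is simplified here):

    def merge_transparent(outer, inner):
        # Loose stand-in for YoutubeDL's force_properties: outer metadata
        # (minus plumbing keys) overrides whatever the inner extractor found.
        force_properties = dict(
            (k, v) for k, v in outer.items()
            if v is not None and k not in ('_type', 'url', 'ie_key'))
        new_result = dict(inner)
        new_result.update(force_properties)
        # The fix from this hunk: a bare 'url' result is re-typed so the merged
        # metadata keeps propagating on the next process_ie_result pass.
        if new_result.get('_type') == 'url':
            new_result['_type'] = 'url_transparent'
        return new_result

    outer = {'_type': 'url_transparent', 'ie_key': 'Example',
             'url': 'http://example.com/v/1', 'title': 'Outer title'}
    inner = {'_type': 'url', 'url': 'http://cdn.example.com/1.mp4'}
    print(merge_transparent(outer, inner))  # keeps 'Outer title', stays transparent
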
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f15606568..c4589411e 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -343,6 +343,7 @@ def _real_main(argv=None):
'retries': opts.retries,
'fragment_retries': opts.fragment_retries,
'skip_unavailable_fragments': opts.skip_unavailable_fragments,
+ 'keep_fragments': opts.keep_fragments,
'buffersize': opts.buffersize,
'noresizebuffer': opts.noresizebuffer,
'continuedl': opts.continue_dl,
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 2c4470a95..5d6621147 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -187,6 +187,9 @@ class FileDownloader(object):
return filename[:-len('.part')]
return filename
+ def ytdl_filename(self, filename):
+ return filename + '.ytdl'
+
def try_rename(self, old_filename, new_filename):
try:
if old_filename == new_filename:
@@ -327,21 +330,22 @@ class FileDownloader(object):
os.path.exists(encodeFilename(filename))
)
- continuedl_and_exists = (
- self.params.get('continuedl', True) and
- os.path.isfile(encodeFilename(filename)) and
- not self.params.get('nopart', False)
- )
-
- # Check file already present
- if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
- self.report_file_already_downloaded(filename)
- self._hook_progress({
- 'filename': filename,
- 'status': 'finished',
- 'total_bytes': os.path.getsize(encodeFilename(filename)),
- })
- return True
+ if not hasattr(filename, 'write'):
+ continuedl_and_exists = (
+ self.params.get('continuedl', True) and
+ os.path.isfile(encodeFilename(filename)) and
+ not self.params.get('nopart', False)
+ )
+
+ # Check file already present
+ if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
+ self.report_file_already_downloaded(filename)
+ self._hook_progress({
+ 'filename': filename,
+ 'status': 'finished',
+ 'total_bytes': os.path.getsize(encodeFilename(filename)),
+ })
+ return True
min_sleep_interval = self.params.get('sleep_interval')
if min_sleep_interval:
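
Note: the new hasattr(filename, 'write') guard exists because the downloader may now be handed an already-open stream instead of a path, and the "already downloaded" check only makes sense for real paths. A reduced illustration of that duck-typing check (not youtube-dl code; the nooverwrites branch is omitted):

    import io
    import os

    def already_downloaded(filename, continuedl=True, nopart=False):
        if hasattr(filename, 'write'):  # a stream is never "already present"
            return False
        return (continuedl and not nopart and
                filename != '-' and os.path.isfile(filename))

    print(already_downloaded('video.mp4'))   # depends on the filesystem
    print(already_downloaded(io.BytesIO()))  # always False
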
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index e2ddc369e..7491fdad8 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,13 +1,7 @@
from __future__ import unicode_literals
-import os
-
from .fragment import FragmentFD
from ..compat import compat_urllib_error
-from ..utils import (
- sanitize_open,
- encodeFilename,
-)
class DashSegmentsFD(FragmentFD):
@@ -28,31 +22,24 @@ class DashSegmentsFD(FragmentFD):
self._prepare_and_start_frag_download(ctx)
- segments_filenames = []
-
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
- def process_segment(segment, tmp_filename, num):
- segment_url = segment['url']
- segment_name = 'Frag%d' % num
- target_filename = '%s-%s' % (tmp_filename, segment_name)
+ frag_index = 0
+ for i, segment in enumerate(segments):
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
# In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment
- fatal = num == 0 or not skip_unavailable_fragments
+ fatal = i == 0 or not skip_unavailable_fragments
count = 0
while count <= fragment_retries:
try:
- success = ctx['dl'].download(target_filename, {
- 'url': segment_url,
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
- down, target_sanitized = sanitize_open(target_filename, 'rb')
- ctx['dest_stream'].write(down.read())
- down.close()
- segments_filenames.append(target_sanitized)
+ self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
# YouTube may often return 404 HTTP error for a fragment causing the
@@ -63,22 +50,14 @@ class DashSegmentsFD(FragmentFD):
# HTTP error.
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, segment_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if not fatal:
- self.report_skip_fragment(segment_name)
- return True
+ self.report_skip_fragment(frag_index)
+ continue
self.report_error('giving up after %s fragment retries' % fragment_retries)
return False
- return True
-
- for i, segment in enumerate(segments):
- if not process_segment(segment, ctx['tmpfilename'], i):
- return False
self._finish_frag_download(ctx)
- for segment_file in segments_filenames:
- os.remove(encodeFilename(segment_file))
-
return True
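
Note: the rewritten loop resumes by fragment index rather than re-downloading everything: fragments at or below ctx['fragment_index'] (restored from the .ytdl file) are skipped, and only the first DASH segment is fatal because it carries the MP4 headers. The same control flow sketched on plain data, assuming skip_unavailable_fragments is on:

    def fragments_to_fetch(segments, resumed_index):
        frag_index = 0
        for i, segment in enumerate(segments):
            frag_index += 1
            if frag_index <= resumed_index:
                continue      # already appended on a previous run
            fatal = i == 0    # first DASH segment carries the MP4 headers
            yield frag_index, segment, fatal

    segments = [{'url': 'seg%d' % n} for n in range(5)]
    for idx, seg, fatal in fragments_to_fetch(segments, resumed_index=2):
        print(idx, seg['url'], 'fatal' if fatal else 'skippable')
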
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index e13cf547d..e78169a0d 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- retval = self._call_downloader(tmpfilename, info_dict)
+ try:
+ retval = self._call_downloader(tmpfilename, info_dict)
+ except KeyboardInterrupt:
+ if not info_dict.get('is_live'):
+ raise
+ # Live stream downloading cancellation should be considered as
+ # correct and expected termination thus all postprocessing
+ # should take place
+ retval = 0
+ self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
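
Note: a sketch of the cancellation rule introduced here, under the stated assumption that stopping a live recording with Ctrl-C is a deliberate, successful end of download, so postprocessing should still run:

    def run_downloader(call_downloader, info_dict):
        try:
            return call_downloader(info_dict)
        except KeyboardInterrupt:
            if not info_dict.get('is_live'):
                raise  # non-live: propagate the interrupt as before
            return 0   # live recording stopped on purpose; keep the output

    # Example: a fake downloader interrupted mid-stream.
    def fake_dl(info):
        raise KeyboardInterrupt

    print(run_downloader(fake_dl, {'is_live': True}))  # -> 0
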
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 688e086eb..c8fde9a89 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -3,7 +3,6 @@ from __future__ import division, unicode_literals
import base64
import io
import itertools
-import os
import time
from .fragment import FragmentFD
@@ -16,9 +15,7 @@ from ..compat import (
compat_struct_unpack,
)
from ..utils import (
- encodeFilename,
fix_xml_ampersands,
- sanitize_open,
xpath_text,
)
@@ -366,17 +363,21 @@ class F4mFD(FragmentFD):
dest_stream = ctx['dest_stream']
- write_flv_header(dest_stream)
- if not live:
- write_metadata_tag(dest_stream, metadata)
+ if ctx['complete_frags_downloaded_bytes'] == 0:
+ write_flv_header(dest_stream)
+ if not live:
+ write_metadata_tag(dest_stream, metadata)
base_url_parsed = compat_urllib_parse_urlparse(base_url)
self._start_frag_download(ctx)
- frags_filenames = []
+ frag_index = 0
while fragments_list:
seg_i, frag_i = fragments_list.pop(0)
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
name = 'Seg%d-Frag%d' % (seg_i, frag_i)
query = []
if base_url_parsed.query:
@@ -386,17 +387,10 @@ class F4mFD(FragmentFD):
if info_dict.get('extra_param_to_segment_url'):
query.append(info_dict['extra_param_to_segment_url'])
url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
- frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
try:
- success = ctx['dl'].download(frag_filename, {
- 'url': url_parsed.geturl(),
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
if not success:
return False
- (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
- down_data = down.read()
- down.close()
reader = FlvReader(down_data)
while True:
try:
@@ -411,12 +405,8 @@ class F4mFD(FragmentFD):
break
raise
if box_type == b'mdat':
- dest_stream.write(box_data)
+ self._append_fragment(ctx, box_data)
break
- if live:
- os.remove(encodeFilename(frag_sanitized))
- else:
- frags_filenames.append(frag_sanitized)
except (compat_urllib_error.HTTPError, ) as err:
if live and (err.code == 404 or err.code == 410):
# We didn't keep up with the live window. Continue
@@ -436,7 +426,4 @@ class F4mFD(FragmentFD):
self._finish_frag_download(ctx)
- for frag_file in frags_filenames:
- os.remove(encodeFilename(frag_file))
-
return True
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 56f975266..bccc8ecc1 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -2,6 +2,7 @@ from __future__ import division, unicode_literals
import os
import time
+import json
from .common import FileDownloader
from .http import HttpFD
@@ -28,15 +29,37 @@ class FragmentFD(FileDownloader):
and hlsnative only)
skip_unavailable_fragments:
Skip unavailable fragments (DASH and hlsnative only)
+ keep_fragments: Keep downloaded fragments on disk after downloading is
+ finished
+
+ For each incomplete fragment download youtube-dl keeps on disk a special
+ bookkeeping file with download state and metadata (in future such files will
+ be used for any incomplete download handled by youtube-dl). This file is
+ used to properly handle resuming, check download file consistency and detect
+ potential errors. The file has a .ytdl extension and represents a standard
+ JSON file of the following format:
+
+ extractor:
+ Dictionary of extractor related data. TBD.
+
+ downloader:
+ Dictionary of downloader related data. May contain following data:
+ current_fragment:
+ Dictionary with current (being downloaded) fragment data:
+ index: 0-based index of current fragment among all fragments
+ fragment_count:
+ Total count of fragments
+
+ This feature is experimental and file format may change in future.
"""
- def report_retry_fragment(self, err, fragment_name, count, retries):
+ def report_retry_fragment(self, err, frag_index, count, retries):
self.to_screen(
- '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...'
- % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries)))
+ '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...'
+ % (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))
- def report_skip_fragment(self, fragment_name):
- self.to_screen('[download] Skipping fragment %s...' % fragment_name)
+ def report_skip_fragment(self, frag_index):
+ self.to_screen('[download] Skipping fragment %d...' % frag_index)
def _prepare_url(self, info_dict, url):
headers = info_dict.get('http_headers')
@@ -46,6 +69,51 @@ class FragmentFD(FileDownloader):
self._prepare_frag_download(ctx)
self._start_frag_download(ctx)
+ @staticmethod
+ def __do_ytdl_file(ctx):
+ return not ctx['live'] and not ctx['tmpfilename'] == '-'
+
+ def _read_ytdl_file(self, ctx):
+ stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+ ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+ stream.close()
+
+ def _write_ytdl_file(self, ctx):
+ frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+ downloader = {
+ 'current_fragment': {
+ 'index': ctx['fragment_index'],
+ },
+ }
+ if ctx.get('fragment_count') is not None:
+ downloader['fragment_count'] = ctx['fragment_count']
+ frag_index_stream.write(json.dumps({'downloader': downloader}))
+ frag_index_stream.close()
+
+ def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
+ fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
+ success = ctx['dl'].download(fragment_filename, {
+ 'url': frag_url,
+ 'http_headers': headers or info_dict.get('http_headers'),
+ })
+ if not success:
+ return False, None
+ down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
+ ctx['fragment_filename_sanitized'] = frag_sanitized
+ frag_content = down.read()
+ down.close()
+ return True, frag_content
+
+ def _append_fragment(self, ctx, frag_content):
+ try:
+ ctx['dest_stream'].write(frag_content)
+ finally:
+ if self.__do_ytdl_file(ctx):
+ self._write_ytdl_file(ctx)
+ if not self.params.get('keep_fragments', False):
+ os.remove(ctx['fragment_filename_sanitized'])
+ del ctx['fragment_filename_sanitized']
+
def _prepare_frag_download(self, ctx):
if 'live' not in ctx:
ctx['live'] = False
@@ -66,11 +134,36 @@ class FragmentFD(FileDownloader):
}
)
tmpfilename = self.temp_name(ctx['filename'])
- dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb')
+ open_mode = 'wb'
+ resume_len = 0
+
+ # Establish possible resume length
+ if os.path.isfile(encodeFilename(tmpfilename)):
+ open_mode = 'ab'
+ resume_len = os.path.getsize(encodeFilename(tmpfilename))
+
+ # Should be initialized before ytdl file check
+ ctx.update({
+ 'tmpfilename': tmpfilename,
+ 'fragment_index': 0,
+ })
+
+ if self.__do_ytdl_file(ctx):
+ if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
+ self._read_ytdl_file(ctx)
+ else:
+ self._write_ytdl_file(ctx)
+ if ctx['fragment_index'] > 0:
+ assert resume_len > 0
+
+ dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+
ctx.update({
'dl': dl,
'dest_stream': dest_stream,
'tmpfilename': tmpfilename,
+ # Total complete fragments downloaded so far in bytes
+ 'complete_frags_downloaded_bytes': resume_len,
})
def _start_frag_download(self, ctx):
@@ -79,9 +172,9 @@ class FragmentFD(FileDownloader):
# hook
state = {
'status': 'downloading',
- 'downloaded_bytes': 0,
- 'frag_index': 0,
- 'frag_count': total_frags,
+ 'downloaded_bytes': ctx['complete_frags_downloaded_bytes'],
+ 'fragment_index': ctx['fragment_index'],
+ 'fragment_count': total_frags,
'filename': ctx['filename'],
'tmpfilename': ctx['tmpfilename'],
}
@@ -89,8 +182,6 @@ class FragmentFD(FileDownloader):
start = time.time()
ctx.update({
'started': start,
- # Total complete fragments downloaded so far in bytes
- 'complete_frags_downloaded_bytes': 0,
# Amount of fragment's bytes downloaded by the time of the previous
# frag progress hook invocation
'prev_frag_downloaded_bytes': 0,
@@ -106,11 +197,12 @@ class FragmentFD(FileDownloader):
if not ctx['live']:
estimated_size = (
(ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) /
- (state['frag_index'] + 1) * total_frags)
+ (state['fragment_index'] + 1) * total_frags)
state['total_bytes_estimate'] = estimated_size
if s['status'] == 'finished':
- state['frag_index'] += 1
+ state['fragment_index'] += 1
+ ctx['fragment_index'] = state['fragment_index']
state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
ctx['prev_frag_downloaded_bytes'] = 0
@@ -132,6 +224,10 @@ class FragmentFD(FileDownloader):
def _finish_frag_download(self, ctx):
ctx['dest_stream'].close()
+ if self.__do_ytdl_file(ctx):
+ ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
+ if os.path.isfile(ytdl_filename):
+ os.remove(ytdl_filename)
elapsed = time.time() - ctx['started']
self.try_rename(ctx['tmpfilename'], ctx['filename'])
fsize = os.path.getsize(encodeFilename(ctx['filename']))
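
Note: the docstring added above specifies the .ytdl bookkeeping file as a small JSON document, and the format is explicitly experimental. A sketch of reading and writing it with only the documented fields, mirroring _read_ytdl_file/_write_ytdl_file:

    import json

    def write_ytdl_file(path, fragment_index, fragment_count=None):
        downloader = {'current_fragment': {'index': fragment_index}}
        if fragment_count is not None:
            downloader['fragment_count'] = fragment_count
        with open(path, 'w') as f:
            f.write(json.dumps({'downloader': downloader}))

    def read_fragment_index(path):
        with open(path) as f:
            return json.load(f)['downloader']['current_fragment']['index']

    write_ytdl_file('video.mp4.ytdl', fragment_index=3, fragment_count=120)
    print(read_fragment_index('video.mp4.ytdl'))  # -> 3
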
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index d0a5f7ba4..0e29c8a2a 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import os.path
import re
import binascii
try:
@@ -18,8 +17,6 @@ from ..compat import (
compat_struct_pack,
)
from ..utils import (
- encodeFilename,
- sanitize_open,
parse_m3u8_attributes,
update_url_query,
)
@@ -103,17 +100,18 @@ class HlsFD(FragmentFD):
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
byte_range = {}
- frags_filenames = []
+ frag_index = 0
for line in s.splitlines():
line = line.strip()
if line:
if not line.startswith('#'):
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
frag_url = (
line
if re.match(r'^https?://', line)
else compat_urlparse.urljoin(man_url, line))
- frag_name = 'Frag%d' % i
- frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
count = 0
@@ -122,15 +120,10 @@ class HlsFD(FragmentFD):
headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])
while count <= fragment_retries:
try:
- success = ctx['dl'].download(frag_filename, {
- 'url': frag_url,
- 'http_headers': headers,
- })
+ success, frag_content = self._download_fragment(
+ ctx, frag_url, info_dict, headers)
if not success:
return False
- down, frag_sanitized = sanitize_open(frag_filename, 'rb')
- frag_content = down.read()
- down.close()
break
except compat_urllib_error.HTTPError as err:
# Unavailable (possibly temporary) fragments may be served.
@@ -139,28 +132,29 @@ class HlsFD(FragmentFD):
# https://github.com/rg3/youtube-dl/issues/10448).
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, frag_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if skip_unavailable_fragments:
i += 1
media_sequence += 1
- self.report_skip_fragment(frag_name)
+ self.report_skip_fragment(frag_index)
continue
self.report_error(
'giving up after %s fragment retries' % fragment_retries)
return False
if decrypt_info['METHOD'] == 'AES-128':
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
+ decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read()
frag_content = AES.new(
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
- ctx['dest_stream'].write(frag_content)
- frags_filenames.append(frag_sanitized)
+ self._append_fragment(ctx, frag_content)
# We only download the first fragment during the test
if test:
break
i += 1
media_sequence += 1
elif line.startswith('#EXT-X-KEY'):
+ decrypt_url = decrypt_info.get('URI')
decrypt_info = parse_m3u8_attributes(line[11:])
if decrypt_info['METHOD'] == 'AES-128':
if 'IV' in decrypt_info:
@@ -170,7 +164,8 @@ class HlsFD(FragmentFD):
man_url, decrypt_info['URI'])
if extra_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
- decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+ if decrypt_url != decrypt_info['URI']:
+ decrypt_info['KEY'] = None
elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
media_sequence = int(line[22:])
elif line.startswith('#EXT-X-BYTERANGE'):
@@ -183,7 +178,4 @@ class HlsFD(FragmentFD):
self._finish_frag_download(ctx)
- for frag_file in frags_filenames:
- os.remove(encodeFilename(frag_file))
-
return True
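
Note: the HLS change makes AES-128 key fetching lazy: the key bytes are cached on decrypt_info and requested only when missing, and the #EXT-X-KEY branch drops the cache only when the key URI actually changes. A toy model of the caching behaviour (fetch stands in for self.ydl.urlopen(...).read()):

    def get_key(decrypt_info, fetch):
        # mirrors: decrypt_info['KEY'] = decrypt_info.get('KEY') or <fetch>
        decrypt_info['KEY'] = decrypt_info.get('KEY') or fetch(decrypt_info['URI'])
        return decrypt_info['KEY']

    calls = []

    def fetch(uri):
        calls.append(uri)
        return b'0123456789abcdef'  # placeholder 16-byte AES-128 key

    info = {'METHOD': 'AES-128', 'URI': 'https://example.com/key'}
    get_key(info, fetch)
    get_key(info, fetch)
    print(len(calls))  # -> 1: the second fragment reuses the cached key
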
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
index 63a636cb7..5f6f9faef 100644
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import os
import time
import struct
import binascii
@@ -8,10 +7,6 @@ import io
from .fragment import FragmentFD
from ..compat import compat_urllib_error
-from ..utils import (
- sanitize_open,
- encodeFilename,
-)
u8 = struct.Struct(b'>B')
@@ -225,50 +220,39 @@ class IsmFD(FragmentFD):
self._prepare_and_start_frag_download(ctx)
- segments_filenames = []
-
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
track_written = False
+ frag_index = 0
for i, segment in enumerate(segments):
- segment_url = segment['url']
- segment_name = 'Frag%d' % i
- target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name)
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
count = 0
while count <= fragment_retries:
try:
- success = ctx['dl'].download(target_filename, {
- 'url': segment_url,
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
- down, target_sanitized = sanitize_open(target_filename, 'rb')
- down_data = down.read()
if not track_written:
- tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd'])
+ tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
track_written = True
- ctx['dest_stream'].write(down_data)
- down.close()
- segments_filenames.append(target_sanitized)
+ self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, segment_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if skip_unavailable_fragments:
- self.report_skip_fragment(segment_name)
+ self.report_skip_fragment(frag_index)
continue
self.report_error('giving up after %s fragment retries' % fragment_retries)
return False
self._finish_frag_download(ctx)
- for segment_file in segments_filenames:
- os.remove(encodeFilename(segment_file))
-
return True
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 100cf997f..7da96c65c 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -1308,6 +1308,12 @@ class AdobePassIE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd'
+ def _download_webpage_handle(self, *args, **kwargs):
+ headers = kwargs.get('headers', {})
+ headers.update(self.geo_verification_headers())
+ kwargs['headers'] = headers
+ return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+
@staticmethod
def _get_mvpd_resource(provider_id, title, guid, rating):
channel = etree.Element('channel')
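
Note: the override above merges geo_verification_headers() into every _download_webpage_handle call. The same kwargs-merging pattern in isolation (the header value below is a placeholder, not what geo_verification_headers() actually returns):

    def with_extra_headers(extra_headers):
        def wrap(download):
            def wrapped(*args, **kwargs):
                headers = dict(kwargs.get('headers') or {})
                headers.update(extra_headers)
                kwargs['headers'] = headers
                return download(*args, **kwargs)
            return wrapped
        return wrap

    @with_extra_headers({'X-Forwarded-For': '127.0.0.1'})  # hypothetical value
    def fetch(url, headers=None):
        return headers

    print(fetch('https://example.com'))
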
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index c01c67303..2dcdba9d2 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
- return self.playlist_result(
- entries, self._html_search_meta('aetn:SeriesId', webpage),
- self._html_search_meta('aetn:SeriesTitle', webpage))
- elif url_parts_len == 2:
+ if entries:
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeriesId', webpage),
+ self._html_search_meta('aetn:SeriesTitle', webpage))
+ else:
+ # single season
+ url_parts_len = 2
+ if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
@@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
- episode_attributes['data-videoid']))
+ episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index 78d29c861..c8cb91dcb 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor):
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls',
note='Downloading part %d m3u8 information' % file_num)
- title = title if one else '%s (part %d)' % (title, file_num)
file_info = common_entry.copy()
file_info.update({
'id': format_id,
- 'title': title,
+ 'title': title if one else '%s (part %d)' % (title, file_num),
'upload_date': upload_date,
'duration': file_duration,
'formats': formats,
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index e8e40126b..fde1a8ff7 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -7,15 +7,19 @@ from ..utils import (
parse_iso8601,
mimetype2ext,
determine_ext,
+ ExtractorError,
)
class AMPIE(InfoExtractor):
# parse Akamai Adaptive Media Player feed
def _extract_feed_info(self, url):
- item = self._download_json(
+ feed = self._download_json(
url, None, 'Downloading Akamai AMP feed',
- 'Unable to download Akamai AMP feed')['channel']['item']
+ 'Unable to download Akamai AMP feed')
+ item = feed.get('channel', {}).get('item')
+ if not item:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
video_id = item['guid']
@@ -30,9 +34,12 @@ class AMPIE(InfoExtractor):
if isinstance(media_thumbnail, dict):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
- thumbnail = thumbnail_data['@attributes']
+ thumbnail = thumbnail_data.get('@attributes', {})
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
thumbnails.append({
- 'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+ 'url': self._proto_relative_url(thumbnail_url, 'http:'),
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
@@ -43,9 +50,14 @@ class AMPIE(InfoExtractor):
if isinstance(media_subtitle, dict):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
- subtitle = subtitle_data['@attributes']
- lang = subtitle.get('lang') or 'en'
- subtitles[lang] = [{'url': subtitle['href']}]
+ subtitle = subtitle_data.get('@attributes', {})
+ subtitle_href = subtitle.get('href')
+ if not subtitle_href:
+ continue
+ subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+ 'url': subtitle_href,
+ 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+ })
formats = []
media_content = get_media_node('content')
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
index 623f44dce..8023da702 100644
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
import hashlib
import json
import random
+import re
import time
from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
intlist_to_bytes,
int_or_none,
strip_jsonp,
+ unescapeHTML,
)
@@ -26,6 +28,8 @@ def md5_text(s):
class AnvatoIE(InfoExtractor):
+ _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
}
+ _MCP_TO_ACCESS_KEY_TABLE = {
+ 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+ 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+ 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+ 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+ 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+ 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+ 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+ 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+ }
+
+ _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
def __init__(self, *args, **kwargs):
@@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):
}
if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
- # Not using _extract_m3u8_formats here as individual media
- # playlists are also included in published_urls.
- if tbr is None:
- formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
- continue
- else:
+ if tbr is not None:
a_format.update({
'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
'ext': 'mp4',
@@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):
'subtitles': subtitles,
}
+ @staticmethod
+ def _extract_urls(ie, webpage, video_id):
+ entries = []
+ for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+ anvplayer_data = ie._parse_json(
+ mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+ fatal=False)
+ if not anvplayer_data:
+ continue
+ video = anvplayer_data.get('video')
+ if not isinstance(video, compat_str) or not video.isdigit():
+ continue
+ access_key = anvplayer_data.get('accessKey')
+ if not access_key:
+ mcp = anvplayer_data.get('mcp')
+ if mcp:
+ access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+ mcp.lower())
+ if not access_key:
+ continue
+ entries.append(ie.url_result(
+ 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+ video_id=video))
+ return entries
+
def _extract_anvato_videos(self, webpage, video_id):
- anvplayer_data = self._parse_json(self._html_search_regex(
- r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
- 'Anvato player data'), video_id)
+ anvplayer_data = self._parse_json(
+ self._html_search_regex(
+ self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+ video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+ if access_key not in self._ANVACK_TABLE:
+ access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+ return self._get_anvato_videos(access_key, video_id)
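
Note: a rough standalone version of the new _extract_urls flow: locate data-anvp player blobs in a page, fall back from accessKey to the MCP alias table (truncated to one entry here), and emit anvato:<access_key>:<video_id> URLs:

    import json
    import re

    ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
    MCP_TO_ACCESS_KEY = {
        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
    }

    def extract_anvato_urls(webpage):
        for mobj in re.finditer(ANVP_RE, webpage):
            data = json.loads(mobj.group('anvp'))
            video = data.get('video')
            if not isinstance(video, str) or not video.isdigit():
                continue
            access_key = data.get('accessKey') or MCP_TO_ACCESS_KEY.get(
                (data.get('mcp') or '').lower())
            if access_key:
                yield 'anvato:%s:%s' % (access_key, video)

    page = '<script data-anvp=\'{"mcp": "CBS", "video": "12345"}\'></script>'
    print(list(extract_anvato_urls(page)))
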
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index ea7a70393..a84b8b1eb 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):
_VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
_TEST = {
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'md5': 'e7c38568a01ea45402570e6029206723',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
'title': 'Energy',
'uploader': 'Drake',
- 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20150710',
'timestamp': 1436545535,
},
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a6801f3d4..b45b431e1 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
'info_dict': {
- 'id': 'blackthorn',
+ 'id': '4489',
+ 'title': 'Blackthorn',
},
'playlist_mincount': 2,
'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
'title': 'Most Popular',
'id': 'mostpopular',
},
- 'playlist_mincount': 80,
+ 'playlist_mincount': 30,
}, {
'url': 'http://trailers.apple.com/#section=moviestudios',
'info_dict': {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index e21045bed..3c7d7250b 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):
}
}, {
'url': 'https://archive.org/details/Cops1922',
- 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+ 'md5': '0869000b4ce265e8ca62738b336b268a',
'info_dict': {
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
- 'description': 'md5:b4544662605877edd99df22f9620d858',
+ 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 69a23e88c..56baef29d 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor):
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
@@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE):
}, {
'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
'only_matching': True,
+ }, {
+ 'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 99af6dc5a..01fa308ff 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):
},
{
'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
- 'md5': '0d0e918533bbd4b263f2de4d197d4aac',
+ 'md5': '6e52cbb513c405e403dbacb7aacf8747',
'info_dict': {
'id': 'capitulo-112-david-bustamante',
'ext': 'flv',
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index 8fc5f65c6..e48bb8972 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):
'title': '3/09/2016 Czaban Hour 3',
'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
'duration': 2245.72,
- 'uploader': 'Steve Czaban',
+ 'uploader': 'SB Nation A.M.',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
}
}, {
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
deleted file mode 100644
index 3ba2f00d3..000000000
--- a/youtube_dl/extractor/azubu.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
- sanitized_Request,
-)
-
-
-class AzubuIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)'
- _TESTS = [
- {
- 'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
- 'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
- 'info_dict': {
- 'id': '15575',
- 'ext': 'mp4',
- 'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
- 'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1417523507.334,
- 'upload_date': '20141202',
- 'duration': 9988.7,
- 'uploader': 'GSL',
- 'uploader_id': 414310,
- 'view_count': int,
- },
- },
- {
- 'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
- 'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
- 'info_dict': {
- 'id': '9344',
- 'ext': 'mp4',
- 'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
- 'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1410530893.320,
- 'upload_date': '20140912',
- 'duration': 172.385,
- 'uploader': 'FnaticTV',
- 'uploader_id': 272749,
- 'view_count': int,
- },
- 'skip': 'Channel offline',
- },
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- data = self._download_json(
- 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
-
- title = data['title'].strip()
- description = data.get('description')
- thumbnail = data.get('thumbnail')
- view_count = data.get('view_count')
- user = data.get('user', {})
- uploader = user.get('username')
- uploader_id = user.get('id')
-
- stream_params = json.loads(data['stream_params'])
-
- timestamp = float_or_none(stream_params.get('creationDate'), 1000)
- duration = float_or_none(stream_params.get('length'), 1000)
-
- renditions = stream_params.get('renditions') or []
- video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
- if video:
- renditions.append(video)
-
- if not renditions and not user.get('channel', {}).get('is_live', True):
- raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
-
- formats = [{
- 'url': fmt['url'],
- 'width': fmt['frameWidth'],
- 'height': fmt['frameHeight'],
- 'vbr': float_or_none(fmt['encodingRate'], 1000),
- 'filesize': fmt['size'],
- 'vcodec': fmt['videoCodec'],
- 'container': fmt['videoContainer'],
- } for fmt in renditions if fmt['url']]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
- 'formats': formats,
- }
-
-
-class AzubuLiveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$'
-
- _TESTS = [{
- 'url': 'http://www.azubu.tv/MarsTVMDLen',
- 'only_matching': True,
- }, {
- 'url': 'http://azubu.uol.com.br/adolfz',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- user = self._match_id(url)
-
- info = self._download_json(
- 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user),
- user)['data']
- if info['type'] != 'STREAM':
- raise ExtractorError('{0} is not streaming live'.format(user), expected=True)
-
- req = sanitized_Request(
- 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id'])
- req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV')
- bc_info = self._download_json(req, user)
- m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS')
- formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4')
- self._sort_formats(formats)
-
- return {
- 'id': info['id'],
- 'title': self._live_title(info['title']),
- 'uploader_id': user,
- 'formats': formats,
- 'is_live': True,
- 'thumbnail': bc_info['poster'],
- }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 056e06376..df2972f26 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '73d0b3171568232574e45652f8720b5c',
+ 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
'info_dict': {
'id': '2650410135',
- 'ext': 'mp3',
- 'title': 'Lanius (Battle)',
- 'uploader': 'Ben Prunty Music',
+ 'ext': 'aiff',
+ 'title': 'Ben Prunty - Lanius (Battle)',
+ 'uploader': 'Ben Prunty',
},
}]
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b0b7914d8..d5c5822f2 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '46c384def73b33dbc581262e5ee67cef',
+ 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 7a8e1f60b..e829974ff 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor):
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
- 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20',
+ 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
@@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'md5': '8c2c12e3af7805152675446c905d159b',
+ 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index ff0aa11b1..2c32b6ae2 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -77,7 +77,7 @@ class BRIE(InfoExtractor):
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
- 'upload_date': '20140117',
+ 'upload_date': '20170208',
}
},
]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 124497e95..3f017a2b1 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -131,6 +131,12 @@ class BrightcoveLegacyIE(InfoExtractor):
},
'playlist_mincount': 10,
},
+ {
+ # playerID inferred from bcpid
+ # from http://www.un.org/chinese/News/story.asp?NewsID=27724
+ 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
+ 'only_matching': True, # Tested in GenericIE
+ }
]
FLV_VCODECS = {
1: 'SORENSON',
@@ -266,9 +272,13 @@ class BrightcoveLegacyIE(InfoExtractor):
if matches:
return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
- return list(filter(None, [
- cls._build_brighcove_url_from_js(custom_bc)
- for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
+ matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
+ if matches:
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in matches]))
+ return [src for _, src in re.findall(
+ r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -285,6 +295,10 @@ class BrightcoveLegacyIE(InfoExtractor):
if videoPlayer:
# We set the original url as the default 'Referer' header
referer = smuggled_data.get('Referer', url)
+ if 'playerID' not in query:
+ mobj = re.search(r'/bcpid(\d+)', url)
+ if mobj is not None:
+ query['playerID'] = [mobj.group(1)]
return self._get_video_info(
videoPlayer[0], query, referer=referer)
elif 'playerKey' in query:
@@ -484,8 +498,8 @@ class BrightcoveNewIE(InfoExtractor):
}]
@staticmethod
- def _extract_url(webpage):
- urls = BrightcoveNewIE._extract_urls(webpage)
+ def _extract_url(ie, webpage):
+ urls = BrightcoveNewIE._extract_urls(ie, webpage)
return urls[0] if urls else None
@staticmethod
@@ -508,7 +522,7 @@ class BrightcoveNewIE(InfoExtractor):
# [2] looks like:
for video, script_tag, account_id, player_id, embed in re.findall(
r'''(?isx)
- (<video\s+[^>]+>)
+ (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
(?:.*?
(<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f1f128c45..acd87e371 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Terrasses du Numérique',
'duration': 122,
},
- 'params': {
- 'skip_download': True, # Requires rtmpdump
- }
}, {
'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
'only_matching': True,
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index cf678e7f8..87ad14e91 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
'info_dict': {
'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
'id': 'dog-indoor-exercise-winter-1.3928238',
+ 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
},
'playlist_mincount': 6,
}]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
}, {
- # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
'url': 'http://www.cbc.ca/player/play/2164402062',
- 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
index 8d5f11dd1..7d78e3aae 100644
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
- 'timestamp': 1479962220,
- 'upload_date': '20161124',
+ 'timestamp': int,
+ 'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 1ee35b501..78b7a923c 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -9,7 +9,10 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ multipart_encode,
parse_duration,
+ random_birthday,
+ urljoin,
)
@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float,
- 'duration': 39
+ 'duration': 39,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
'uploader': 'crash404',
'view_count': int,
'average_rating': float,
- 'duration': 137
+ 'duration': 137,
+ 'age_limit': 0,
}
}, {
+ # Age-restricted
+ 'url': 'http://www.cda.pl/video/1273454c4',
+ 'info_dict': {
+ 'id': '1273454c4',
+ 'ext': 'mp4',
+ 'title': 'Bronson (2008) napisy HD 1080p',
+ 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+ 'height': 1080,
+ 'uploader': 'boniek61',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5554,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'average_rating': float,
+ },
+ }, {
'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True,
}]
+ def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+ form_data = random_birthday('rok', 'miesiac', 'dzien')
+ form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+ data, content_type = multipart_encode(form_data)
+ return self._download_webpage(
+ urljoin(url, '/a/validatebirth'), video_id, *args,
+ data=data, headers={
+ 'Referer': url,
+ 'Content-Type': content_type,
+ }, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
+ need_confirm_age = False
+ if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+ webpage, 'birthday validate form', default=None):
+ webpage = self._download_age_confirm_page(
+ url, video_id, note='Confirming age')
+ need_confirm_age = True
+
formats = []
uploader = self._search_regex(r'''(?x)
@@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'duration': None,
+ 'age_limit': 18 if need_confirm_age else 0,
}
def extract_format(page, version):
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage):
- webpage = self._download_webpage(
+ if need_confirm_age:
+ handler = self._download_age_confirm_page
+ else:
+ handler = self._download_webpage
+
+ webpage = handler(
self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
# invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution)
continue
+
extract_format(webpage, resolution)
self._sort_formats(formats)
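
Note: the age-gate handling POSTs a random adult birthday to /a/validatebirth with the original URL in the 'return' field, then re-reads the page. random_birthday and multipart_encode come from youtube_dl.utils; the stand-in below only shows the payload shape:

    import random

    def random_birthday(year_field, month_field, day_field):
        # stand-in for youtube_dl.utils.random_birthday: any adult date works
        return {
            year_field: str(random.randint(1950, 1995)),
            month_field: str(random.randint(1, 12)),
            day_field: str(random.randint(1, 28)),
        }

    form_data = random_birthday('rok', 'miesiac', 'dzien')
    form_data.update({'return': 'http://www.cda.pl/video/1273454c4',
                      'module': 'video', 'module_id': '1273454c4'})
    print(form_data)  # multipart-encoded and POSTed to /a/validatebirth
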
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index bb52e0c6f..0920f6219 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': '720563e467b86374c194bdead08d207d',
+ 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
'info_dict': {
'id': '4343170',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 18c734766..6a41db87c 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
'description': '',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 7713.088,
'timestamp': 1413309600,
'upload_date': '20141014',
@@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'wmv',
'title': '64ste Vakantiecursus: Afvalwater',
'description': 'md5:7fd774865cc69d972f542b157c328305',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 10853,
'timestamp': 1326446400,
'upload_date': '20120113',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e54adc9f0..76b5378e9 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -245,6 +245,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -976,6 +980,23 @@ class InfoExtractor(object):
return info
if isinstance(json_ld, dict):
json_ld = [json_ld]
+
+ def extract_video_object(e):
+ assert e['@type'] == 'VideoObject'
+ info.update({
+ 'url': e.get('contentUrl'),
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
+ 'filesize': float_or_none(e.get('contentSize')),
+ 'tbr': int_or_none(e.get('bitrate')),
+ 'width': int_or_none(e.get('width')),
+ 'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
+ })
+
for e in json_ld:
if e.get('@context') == 'http://schema.org':
item_type = e.get('@type')
@@ -1000,18 +1021,11 @@ class InfoExtractor(object):
'description': unescapeHTML(e.get('articleBody')),
})
elif item_type == 'VideoObject':
- info.update({
- 'url': e.get('contentUrl'),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- 'filesize': float_or_none(e.get('contentSize')),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- })
+ extract_video_object(e)
+ elif item_type == 'WebPage':
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
break
return dict((k, v) for k, v in info.items() if v is not None)
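The new WebPage branch covers pages whose JSON-LD wraps the video instead of exposing a top-level VideoObject, i.e. markup of roughly this shape (illustrative data, not from a real page):

    json_ld = {
        '@context': 'http://schema.org',
        '@type': 'WebPage',
        'video': {
            '@type': 'VideoObject',
            'name': 'Sample clip',
            'contentUrl': 'http://example.com/video.mp4',
        },
    }
    # extract_video_object() is now reached through the nested 'video'
    # entity as well as through a top-level VideoObject.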
@@ -1303,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False):
-
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
+
if res is False:
return []
+
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
+ return self._parse_m3u8_formats(
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+ preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ formats = []
format_url = lambda u: (
u
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- # We should try extracting formats only from master playlists [1], i.e.
- # playlists that describe available qualities. On the other hand media
- # playlists [2] should be returned as is since they contain just the media
- # without qualities renditions.
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
- # playlist based on particular tags availability. As of [1, 2] master
- # playlist tags MUST NOT appear in a media playist and vice versa.
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
- # and MUST NOT appear in master playlist thus we can clearly detect media
- # playlist with this criterion.
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
@@ -1345,52 +1369,72 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}]
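A self-contained check of the detection criterion described above (sample manifests, not from a real service):

    MASTER = '#EXTM3U\n#EXT-X-STREAM-INF:BANDWIDTH=1280000\nlow/index.m3u8\n'
    MEDIA = '#EXTM3U\n#EXT-X-TARGETDURATION:10\n#EXTINF:9.009,\nseg0.ts\n'

    def is_media_playlist(m3u8_doc):
        # EXT-X-TARGETDURATION is REQUIRED in media playlists and MUST NOT
        # appear in master playlists, so its presence settles the question.
        return '#EXT-X-TARGETDURATION' in m3u8_doc

    assert not is_media_playlist(MASTER) and is_media_playlist(MEDIA)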
- audio_in_video_stream = {}
- last_info = {}
- last_media = {}
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
+ def build_stream_name():
+ # Although the specification does not mention the NAME attribute for
+ # the EXT-X-STREAM-INF tag, it may still be present in practice (see [1]
+ # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from the corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = parse_m3u8_attributes(line)
+ last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- media = parse_m3u8_attributes(line)
- media_type = media.get('TYPE')
- if media_type in ('VIDEO', 'AUDIO'):
- group_id = media.get('GROUP-ID')
- media_url = media.get('URI')
- if media_url:
- format_id = []
- for v in (group_id, media.get('NAME')):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
- 'language': media.get('LANGUAGE'),
- 'ext': ext,
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- if group_id and not audio_in_video_stream.get(group_id):
- audio_in_video_stream[group_id] = False
- formats.append(f)
- else:
- # When there is no URI in EXT-X-MEDIA let this tag's
- # data be used by regular URI lines below
- last_media = media
- if media_type == 'AUDIO' and group_id:
- audio_in_video_stream[group_id] = True
+ extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- # Despite specification does not mention NAME attribute for
- # EXT-X-STREAM-INF it still sometimes may be present
- stream_name = last_info.get('NAME') or last_media.get('NAME')
+ stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@@ -1400,14 +1444,14 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
- resolution = last_info.get('RESOLUTION')
+ resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
@@ -1423,13 +1467,26 @@ class InfoExtractor(object):
'vbr': vbr,
'abr': abr,
})
- f.update(parse_codecs(last_info.get('CODECS')))
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
- # TODO: update acodec for audio only formats with the same GROUP-ID
- f['acodec'] = 'none'
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected: for example, [2]
+ # contains an EXT-X-STREAM-INF tag which references an AUDIO
+ # rendition group but has no CODECS attribute, and despite
+ # referencing an audio group it represents a complete
+ # (with audio and video) format. So, for such cases we will
+ # ignore references to rendition groups and treat them as
+ # complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
- last_info = {}
- last_media = {}
+ last_stream_inf = {}
return formats
@staticmethod
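Pulled out on its own, the new rendition-group rule is roughly this predicate (a sketch; groups maps GROUP-ID to the EXT-X-MEDIA attribute dicts collected by extract_media):

    def is_video_only(f, last_stream_inf, groups, codecs):
        # A variant referencing an AUDIO rendition group that has its own
        # URI carries no muxed audio -- but only when CODECS is present;
        # without CODECS the group reference is ignored and the format
        # kept complete.
        audio_group_id = last_stream_inf.get('AUDIO')
        if not (audio_group_id and codecs and f.get('vcodec') != 'none'):
            return False
        audio_group = groups.get(audio_group_id)
        return bool(audio_group and audio_group[0].get('URI'))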
@@ -1803,7 +1860,7 @@ class InfoExtractor(object):
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(bandwidth, 1000),
+ 'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -2182,7 +2239,7 @@ class InfoExtractor(object):
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
webpage)
if mobj:
try:
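The added (?s) flag and lazy gap let the pattern reach setup() calls chained across lines, e.g. (illustrative snippet):

    import re

    webpage = "jwplayer('player')\n  .setup({file: 'v.mp4'})"
    mobj = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    assert mobj and mobj.group('options') == "{file: 'v.mp4'"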
@@ -2258,11 +2315,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ urls = []
formats = []
for source in jwplayer_sources_data:
- source_url = self._proto_relative_url(source['file'])
+ source_url = self._proto_relative_url(source.get('file'))
+ if not source_url:
+ continue
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
+ if source_url in urls:
+ continue
+ urls.append(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
index 5fa1f006b..6ea03e65c 100644
--- a/youtube_dl/extractor/coub.py
+++ b/youtube_dl/extractor/coub.py
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
'duration': 4.6,
'timestamp': 1428527772,
'upload_date': '20150408',
- 'uploader': 'Артём Лоскутников',
+ 'uploader': 'Artyom Loskutnikov',
'uploader_id': 'artyom.loskutnikov',
'view_count': int,
'like_count': int,
'repost_count': int,
- 'comment_count': int,
'age_limit': 0,
},
}, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
like_count = int_or_none(coub.get('likes_count'))
repost_count = int_or_none(coub.get('recoubs_count'))
- comment_count = int_or_none(coub.get('comments_count'))
age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
'repost_count': repost_count,
- 'comment_count': comment_count,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 2ed8b30bb..2ffa4a7f8 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': {
'id': '727589',
'ext': 'mp4',
- 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!",
+ 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",
'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Kadokawa Pictures Inc.',
@@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'series': "KONOSUBA -God's blessing on this wonderful world!",
'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
- 'episode': 'Give Me Deliverance from this Judicial Injustice!',
+ 'episode': 'Give Me Deliverance From This Judicial Injustice!',
'episode_number': 1,
},
'params': {
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 246efde43..441114d19 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -51,6 +51,24 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_TESTS = [
{
+ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+ 'md5': '074b95bdee76b9e3654137aee9c79dfe',
+ 'info_dict': {
+ 'id': 'x5kesuj',
+ 'ext': 'mp4',
+ 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
+ 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 187,
+ 'timestamp': 1493651285,
+ 'upload_date': '20170501',
+ 'uploader': 'Deadline',
+ 'uploader_id': 'x1xm8ri',
+ 'age_limit': 0,
+ 'view_count': int,
+ },
+ },
+ {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
'md5': '2137c41a8e78554bb09225b8eb322406',
'info_dict': {
@@ -66,7 +84,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'uploader_id': 'xijv66',
'age_limit': 0,
'view_count': int,
- }
+ },
+ 'skip': 'video gone',
},
# Vevo video
{
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
index bdfe638b4..5c9c0ecdc 100644
--- a/youtube_dl/extractor/democracynow.py
+++ b/youtube_dl/extractor/democracynow.py
@@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
- 'title': 'Daily Show',
+ 'title': 'Daily Show for July 03, 2015',
+ 'description': 'md5:80eb927244d6749900de6072c7cc2c86',
},
}, {
'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 1f75352ca..148605c0b 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):
'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
'duration': 290,
'timestamp': 1476767794.2809999,
- 'upload_date': '20160525',
+ 'upload_date': '20161018',
'uploader': 'parthivi001',
'uploader_id': 'user52596202',
'view_count': int,
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 82d8a042f..d22133d24 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -20,7 +20,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': '17732',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 1671090f4..c0020dd7d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE
from .amcnetworks import AMCNetworksIE
from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
+from .anvato import AnvatoIE
from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
@@ -87,7 +88,6 @@ from .azmedien import (
AZMedienPlaylistIE,
AZMedienShowPlaylistIE,
)
-from .azubu import AzubuIE, AzubuLiveIE
from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
@@ -663,6 +663,7 @@ from .nintendo import NintendoIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
+from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
from .nova import NovaIE
@@ -939,6 +940,7 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
+from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -1233,7 +1235,10 @@ from .wrzuta import (
WrzutaIE,
WrzutaPlaylistIE,
)
-from .wsj import WSJIE
+from .wsj import (
+ WSJIE,
+ WSJArticleIE,
+)
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
@@ -1295,5 +1300,6 @@ from .youtube import (
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index a3bb98377..985542727 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TEST = {
- 'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'url': 'http://www.foxsports.com/tennessee/video/432609859715',
'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'i0qKWsk3qJaM',
+ 'id': 'bwduI3X_TgUB',
'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
@@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config = self._parse_json(
- self._search_regex(
- r"data-player-config='([^']+)'", webpage, 'data player config'),
+ self._html_search_regex(
+ r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
+ webpage, 'data player config'),
video_id)
return self.url_result(smuggle_url(update_url_query(
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 81c0ce9a3..49409369c 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor):
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
source_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
bitrates.sort()
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 682c49e79..00d311158 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -78,8 +78,7 @@ class GameSpotIE(OnceIE):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index bc7c21f7a..b06f43446 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -85,6 +85,9 @@ from .ustream import UstreamIE
from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
class GenericIE(InfoExtractor):
@@ -430,6 +433,22 @@ class GenericIE(InfoExtractor):
},
},
{
+ # Brightcove video in <iframe>
+ 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
+ 'md5': '36d74ef5e37c8b4a2ce92880d208b968',
+ 'info_dict': {
+ 'id': '5360463607001',
+ 'ext': 'mp4',
+ 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活',
+ 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
+ 'uploader': 'United Nations',
+ 'uploader_id': '1362235914001',
+ 'timestamp': 1489593889,
+ 'upload_date': '20170315',
+ },
+ 'add_ie': ['BrightcoveLegacy'],
+ },
+ {
# Brightcove with alternative playerID key
'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
'info_dict': {
@@ -1410,6 +1429,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1651,6 +1686,38 @@ class GenericIE(InfoExtractor):
},
'add_ie': [SenateISVPIE.ie_key()],
},
+ {
+ # Limelight embeds (1 channel embed + 4 media embeds)
+ 'url': 'http://www.sedona.com/FacilitatorTraining2017',
+ 'info_dict': {
+ 'id': 'FacilitatorTraining2017',
+ 'title': 'Facilitator Training 2017',
+ },
+ 'playlist_mincount': 5,
+ },
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1693,7 +1760,7 @@ class GenericIE(InfoExtractor):
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
@@ -2483,6 +2550,11 @@ class GenericIE(InfoExtractor):
return self.url_result(piksel_url, PikselIE.ie_key())
# Look for Limelight embeds
+ limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+ if limelight_urls:
+ return self.playlist_result(
+ limelight_urls, video_id, video_title, video_description)
+
mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
if mobj:
lm = {
@@ -2506,6 +2578,12 @@ class GenericIE(InfoExtractor):
'limelight:media:%s' % mobj.group('id'),
{'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
+
# Look for AdobeTVVideo embeds
mobj = re.search(
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
@@ -2623,6 +2701,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 4c9be47b4..9c7b1bd37 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -36,22 +36,26 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
- 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
+ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
- 'id': '0_g86w5onx',
+ 'id': 'VDKA3807643',
'ext': 'mp4',
- 'title': 'Sneak Peek: Language Arts',
- 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c',
+ 'title': 'The Traitor in the White House',
+ 'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
- 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
- 'only_matching': True,
+ 'url': 'http://watchdisneyxd.go.com/doraemon',
+ 'info_dict': {
+ 'title': 'Doraemon',
+ 'id': 'SH55574025',
+ },
+ 'playlist_mincount': 51,
}, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
@@ -60,19 +64,36 @@ class GoIE(AdobePassIE):
'only_matching': True,
}]
+ def _extract_videos(self, brand, video_id='-1', show_id='-1'):
+ display_id = video_id if video_id != '-1' else show_id
+ return self._download_json(
+ 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+ display_id)['video']
+
def _real_extract(self, url):
sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ site_info = self._SITE_INFO[sub_domain]
+ brand = site_info['brand']
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
- site_info = self._SITE_INFO[sub_domain]
- brand = site_info['brand']
- video_data = self._download_json(
- 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
- video_id)['video'][0]
+ r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+ if not video_id:
+ # show extraction works for Disney, DisneyJunior and DisneyXD;
+ # ABC and Freeform have a different layout
+ show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+ videos = self._extract_videos(brand, show_id=show_id)
+ show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+ entries = []
+ for video in videos:
+ entries.append(self.url_result(
+ video['url'], 'Go', video.get('id'), video.get('title')))
+ entries.reverse()
+ return self.playlist_result(entries, show_id, show_title)
+ video_data = self._extract_videos(brand, video_id)[0]
+ video_id = video_data['id']
title = video_data['title']
formats = []
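The shared endpoint takes either id, with '-1' acting as a wildcard for the other; for instance (brand code and ids are placeholders):

    API = ('http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/'
           '%s/001/-1/%s/-1/%s/-1/-1.json')

    print(API % ('001', '-1', 'VDKA3807643'))  # single video: show_id = '-1'
    print(API % ('001', 'SH55574025', '-1'))   # whole show: video_id = '-1'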
@@ -105,7 +126,7 @@ class GoIE(AdobePassIE):
self._initialize_geo_bypass(['US'])
entitlement = self._download_json(
'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
- video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
+ video_id, data=urlencode_postdata(data))
errors = entitlement.get('errors', {}).get('errors', [])
if errors:
for error in errors:
diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py
index 3550eca7c..9b2e1c164 100644
--- a/youtube_dl/extractor/go90.py
+++ b/youtube_dl/extractor/go90.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
int_or_none,
parse_iso8601,
)
@@ -18,7 +19,7 @@ class Go90IE(InfoExtractor):
'info_dict': {
'id': '84BUqjLpf9D',
'ext': 'mp4',
- 'title': 'Inside The Utah Coalition Against Pornography Convention',
+ 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention',
'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.',
'timestamp': 1491868800,
'upload_date': '20170411',
@@ -32,11 +33,28 @@ class Go90IE(InfoExtractor):
video_id, headers={
'Content-Type': 'application/json; charset=utf-8',
}, data=b'{"client":"web","device_type":"pc"}')
- title = video_data['title']
main_video_asset = video_data['main_video_asset']
+ episode_number = int_or_none(video_data.get('episode_number'))
+ series = None
+ season = None
+ season_id = None
+ season_number = None
+ for metadata in video_data.get('__children', {}).get('Item', {}).values():
+ if metadata.get('type') == 'show':
+ series = metadata.get('title')
+ elif metadata.get('type') == 'season':
+ season = metadata.get('title')
+ season_id = metadata.get('id')
+ season_number = int_or_none(metadata.get('season_number'))
+
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
thumbnails = []
formats = []
+ subtitles = {}
for asset in video_data.get('assets'):
if asset.get('id') == main_video_asset:
for source in asset.get('sources', []):
@@ -70,6 +88,15 @@ class Go90IE(InfoExtractor):
'height': int_or_none(source.get('height')),
'tbr': int_or_none(source.get('bitrate')),
})
+
+ for caption in asset.get('caption_metadata', []):
+ caption_url = caption.get('source_url')
+ if not caption_url:
+ continue
+ subtitles.setdefault(caption.get('language', 'en'), []).append({
+ 'url': caption_url,
+ 'ext': determine_ext(caption_url, 'vtt'),
+ })
elif asset.get('type') == 'image':
asset_location = asset.get('location')
if not asset_location:
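The subtitles mapping produced by the caption loop groups tracks by language; a hypothetical single-track result:

    subtitles = {
        'en': [{'url': 'https://example.com/captions/en.vtt', 'ext': 'vtt'}],
    }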
@@ -89,4 +116,11 @@ class Go90IE(InfoExtractor):
'description': video_data.get('short_description'),
'like_count': int_or_none(video_data.get('like_count')),
'timestamp': parse_iso8601(video_data.get('released_at')),
+ 'series': series,
+ 'episode': episode,
+ 'season': season,
+ 'season_id': season_id,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 9fb71e8ef..fe425e786 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):
def _extract_http_audio(self, webpage, video_id):
fields = self._hidden_inputs(webpage)
- http_audio_url = fields['filename']
- if http_audio_url is None:
+ http_audio_url = fields.get('filename')
+ if not http_audio_url:
return []
cookies_header = {'Cookie': self._extract_cookies(webpage)}
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index c1921cbcf..4667335e0 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
(video_url, description, thumbnail, timestamp, uploader,
- uploader_id, like_count, comment_count, height, width) = [None] * 10
+ uploader_id, like_count, comment_count, comments, height,
+ width) = [None] * 11
shared_data = self._parse_json(
self._search_regex(
@@ -121,7 +122,10 @@ class InstagramIE(InfoExtractor):
video_id, fatal=False)
if shared_data:
media = try_get(
- shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
+ shared_data,
+ (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+ lambda x: x['entry_data']['PostPage'][0]['media']),
+ dict)
if media:
video_url = media.get('video_url')
height = int_or_none(media.get('dimensions', {}).get('height'))
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 2af6a6db4..fdfa7de9e 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor):
'only_matching': True,
}, {
'url': 'http://yule.iqiyi.com/pcb.html',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '4a0af228fddb55ec96398a364248ed7f',
+ 'ext': 'mp4',
+ 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰',
+ },
}, {
# VIP-only video. The first 2 parts (6 minutes) are available without login
# MD5 sums omitted as values are different on Travis CI and my machine
@@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor):
url, 'temp_id', note='download video page')
# There's no simple way to determine whether a URL is a playlist or not
- # So detect it
- playlist_result = self._extract_playlist(webpage)
- if playlist_result:
- return playlist_result
-
+ # Sometimes there are playlist links in individual videos, so treat it
+ # as a single video first
tvid = self._search_regex(
- r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+ r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None)
+ if tvid is None:
+ playlist_result = self._extract_playlist(webpage)
+ if playlist_result:
+ return playlist_result
+ raise ExtractorError('Can\'t find any video')
+
video_id = self._search_regex(
- r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+ r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
formats = []
for _ in range(5):
@@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor):
self._sort_formats(formats)
title = (get_element_by_id('widget-videotitle', webpage) or
- clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+ clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or
+ self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index 021c6b278..f3156804d 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -116,13 +116,25 @@ class ITVIE(InfoExtractor):
if not play_path:
continue
tbr = int_or_none(media_file.get('bitrate'), 1000)
- formats.append({
+ f = {
'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
- 'url': rtmp_url,
'play_path': play_path,
+ # Providing this swfVfy player URL helps avoid truncated downloads
+ 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+ 'page_url': url,
'tbr': tbr,
'ext': 'flv',
- })
+ }
+ app = self._search_regex(
+ 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+ if app:
+ f.update({
+ 'url': rtmp_url.split('?', 1)[0],
+ 'app': app,
+ })
+ else:
+ f['url'] = rtmp_url
+ formats.append(f)
ios_playlist_url = params.get('data-video-playlist')
hmac = params.get('data-video-hmac')
@@ -172,7 +184,9 @@ class ITVIE(InfoExtractor):
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ href, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
@@ -189,7 +203,8 @@ class ITVIE(InfoExtractor):
'ext': 'ttml' if ext == 'xml' else ext,
})
- return {
+ info = self._search_json_ld(webpage, video_id, default={})
+ info.update({
'id': video_id,
'title': title,
'formats': formats,
@@ -198,4 +213,5 @@ class ITVIE(InfoExtractor):
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
'duration': parse_duration(xpath_text(playlist, 'Duration')),
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
index 9eda956d2..0a07c1320 100644
--- a/youtube_dl/extractor/leeco.py
+++ b/youtube_dl/extractor/leeco.py
@@ -23,7 +23,6 @@ from ..utils import (
str_or_none,
url_basename,
urshift,
- update_url_query,
)
@@ -51,7 +50,7 @@ class LeIE(InfoExtractor):
'id': '1415246',
'ext': 'mp4',
'title': '美人天下01',
- 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+ 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
},
'params': {
'hls_prefer_native': True,
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor):
'params': {
'hls_prefer_native': True,
},
- 'skip': 'Only available in China',
}, {
'url': 'http://sports.le.com/video/25737697.html',
'only_matching': True,
@@ -81,7 +79,7 @@ class LeIE(InfoExtractor):
'only_matching': True,
}]
- # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
+ # ror() and calc_time_key() are reverse-engineered from an embedded swf file in LetvPlayer.swf
def ror(self, param1, param2):
_loc3_ = 0
while _loc3_ < param2:
@@ -90,15 +88,8 @@ class LeIE(InfoExtractor):
return param1
def calc_time_key(self, param1):
- _loc2_ = 773625421
- _loc3_ = self.ror(param1, _loc2_ % 13)
- _loc3_ = _loc3_ ^ _loc2_
- _loc3_ = self.ror(_loc3_, _loc2_ % 17)
- return _loc3_
-
- # reversed from http://jstatic.letvcdn.com/sdk/player.js
- def get_mms_key(self, time):
- return self.ror(time, 8) ^ 185025305
+ _loc2_ = 185025305
+ return self.ror(param1, _loc2_ % 17) ^ _loc2_
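Standalone, the simplified derivation reproduces the old get_mms_key() constant (185025305 % 17 == 8, the former rotation amount); a sketch with _urshift standing in for youtube_dl.utils.urshift:

    import time

    def _urshift(val, n):  # stand-in for youtube_dl.utils.urshift
        return (val % 0x100000000) >> n

    def ror(val, n):
        for _ in range(n):
            val = _urshift(val, 1) + ((val & 1) << 31)
        return val

    def calc_time_key(ts):
        key = 185025305
        return ror(ts, key % 17) ^ key

    print(calc_time_key(int(time.time())))  # tkey for the playJson request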
# see M3U8Encryption class in KLetvPlayer.swf
@staticmethod
@@ -122,7 +113,7 @@ class LeIE(InfoExtractor):
def _check_errors(self, play_json):
# Check for errors
- playstatus = play_json['playstatus']
+ playstatus = play_json['msgs']['playstatus']
if playstatus['status'] == 0:
flag = playstatus['flag']
if flag == 1:
@@ -134,58 +125,31 @@ class LeIE(InfoExtractor):
media_id = self._match_id(url)
page = self._download_webpage(url, media_id)
- play_json_h5 = self._download_json(
- 'http://api.le.com/mms/out/video/playJsonH5',
- media_id, 'Downloading html5 playJson data', query={
- 'id': media_id,
- 'platid': 3,
- 'splatid': 304,
- 'format': 1,
- 'tkey': self.get_mms_key(int(time.time())),
- 'domain': 'www.le.com',
- 'tss': 'no',
- },
- headers=self.geo_verification_headers())
- self._check_errors(play_json_h5)
-
play_json_flash = self._download_json(
- 'http://api.le.com/mms/out/video/playJson',
+ 'http://player-pc.le.com/mms/out/video/playJson',
media_id, 'Downloading flash playJson data', query={
'id': media_id,
'platid': 1,
'splatid': 101,
'format': 1,
+ 'source': 1000,
'tkey': self.calc_time_key(int(time.time())),
'domain': 'www.le.com',
+ 'region': 'cn',
},
headers=self.geo_verification_headers())
self._check_errors(play_json_flash)
- def get_h5_urls(media_url, format_id):
- location = self._download_json(
- media_url, media_id,
- 'Download JSON metadata for format %s' % format_id, query={
- 'format': 1,
- 'expect': 3,
- 'tss': 'no',
- })['location']
-
- return {
- 'http': update_url_query(location, {'tss': 'no'}),
- 'hls': update_url_query(location, {'tss': 'ios'}),
- }
-
def get_flash_urls(media_url, format_id):
- media_url += '&' + compat_urllib_parse_urlencode({
- 'm3v': 1,
- 'format': 1,
- 'expect': 3,
- 'rateid': format_id,
- })
-
nodes_data = self._download_json(
media_url, media_id,
- 'Download JSON metadata for format %s' % format_id)
+ 'Download JSON metadata for format %s' % format_id,
+ query={
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'tss': 'ios',
+ })
req = self._request_webpage(
nodes_data['nodelist'][0]['location'], media_id,
@@ -199,29 +163,28 @@ class LeIE(InfoExtractor):
extracted_formats = []
formats = []
- for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
- playurl = play_json['playurl']
- play_domain = playurl['domain'][0]
-
- for format_id, format_data in playurl.get('dispatch', []).items():
- if format_id in extracted_formats:
- continue
- extracted_formats.append(format_id)
-
- media_url = play_domain + format_data[0]
- for protocol, format_url in get_urls(media_url, format_id).items():
- f = {
- 'url': format_url,
- 'ext': determine_ext(format_data[1]),
- 'format_id': '%s-%s' % (protocol, format_id),
- 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
- 'quality': int_or_none(format_id),
- }
-
- if format_id[-1:] == 'p':
- f['height'] = int_or_none(format_id[:-1])
-
- formats.append(f)
+ playurl = play_json_flash['msgs']['playurl']
+ play_domain = playurl['domain'][0]
+
+ for format_id, format_data in playurl.get('dispatch', []).items():
+ if format_id in extracted_formats:
+ continue
+ extracted_formats.append(format_id)
+
+ media_url = play_domain + format_data[0]
+ for protocol, format_url in get_flash_urls(media_url, format_id).items():
+ f = {
+ 'url': format_url,
+ 'ext': determine_ext(format_data[1]),
+ 'format_id': '%s-%s' % (protocol, format_id),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ 'quality': int_or_none(format_id),
+ }
+
+ if format_id[-1:] == 'p':
+ f['height'] = int_or_none(format_id[:-1])
+
+ formats.append(f)
self._sort_formats(formats, ('height', 'quality', 'format_id'))
publish_time = parse_iso8601(self._html_search_regex(
diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py
index d3bca6435..b312e77f1 100644
--- a/youtube_dl/extractor/lego.py
+++ b/youtube_dl/extractor/lego.py
@@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor):
formats = self._extract_akamai_formats(
'%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none',
formats))
if len(m3u8_formats) == len(self._BITRATES):
self._sort_formats(m3u8_formats)
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index f52c2e169..0a5a3956c 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -9,6 +9,7 @@ from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ smuggle_url,
unsmuggle_url,
ExtractorError,
)
@@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor):
_PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
_API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+ @classmethod
+ def _extract_urls(cls, webpage, source_url):
+ lm = {
+ 'Media': 'media',
+ 'Channel': 'channel',
+ 'ChannelList': 'channel_list',
+ }
+ entries = []
+ for kind, video_id in re.findall(
+ r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
+ webpage):
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (lm[kind], video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind, video_id))
+ for mobj in re.finditer(
+ # As per [1] the class attribute should be exactly equal to
+ # LimelightEmbeddedPlayerFlash, but numerous examples have been
+ # seen that don't exactly match it (e.g. [2]).
+ # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
+ # 2. http://www.sedona.com/FacilitatorTraining2017
+ r'''(?sx)
+ <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
+ <param[^>]+
+ name=(["\'])flashVars\2[^>]+
+ value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32})
+ ''', webpage):
+ kind, video_id = mobj.group('kind'), mobj.group('id')
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (kind, video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind.capitalize(), video_id))
+ return entries
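From the generic extractor the helper is invoked as below (hypothetical page content; url_result is a staticmethod, so no instance is needed):

    webpage = ('<script>LimelightPlayer.doLoadMedia('
               '"0123456789abcdef0123456789abcdef");</script>')
    entries = LimelightBaseIE._extract_urls(webpage, 'http://example.com/page')
    # -> one url_result pointing at limelight:media:<id>, with the source
    #    page smuggled in for the referer check.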
+
def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
headers = {}
if referer:
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644
index 000000000..f7fa098a5
--- /dev/null
+++ b/youtube_dl/extractor/noovo.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ smuggle_url,
+ try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+ _TESTS = [{
+ # clip
+ 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+ 'info_dict': {
+ 'id': '5386045029001',
+ 'ext': 'mp4',
+ 'title': 'Chrysler Imperial',
+ 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+ 'timestamp': 1491399228,
+ 'upload_date': '20170405',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': 'RPM+',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # episode
+ 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+ 'info_dict': {
+ 'id': '5395865725001',
+ 'title': 'Épisode 13 : Les retrouvailles',
+ 'description': 'md5:336d5ebc5436534e61d16e63ddfca327',
+ 'ext': 'mp4',
+ 'timestamp': 1492019320,
+ 'upload_date': '20170412',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': "L'amour est dans le pré",
+ 'season_number': 5,
+ 'episode': 'Épisode 13',
+ 'episode_number': 13,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id,
+ video_id)['data']
+
+ content = try_get(data, lambda x: x['contents'][0])
+
+ brightcove_id = data.get('brightcoveId') or content['brightcoveId']
+
+ series = try_get(
+ data, (
+ lambda x: x['show']['title'],
+ lambda x: x['season']['show']['title']),
+ compat_str)
+
+ episode = None
+ og = data.get('og')
+ if isinstance(og, dict) and og.get('type') == 'video.episode':
+ episode = og.get('title')
+
+ video = content or data
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': BrightcoveNewIE.ie_key(),
+ 'url': smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['CA']}),
+ 'id': brightcove_id,
+ 'title': video.get('title'),
+ 'creator': video.get('source'),
+ 'view_count': int_or_none(video.get('viewsCount')),
+ 'series': series,
+ 'season_number': int_or_none(try_get(
+ data, lambda x: x['season']['seasonNumber'])),
+ 'episode': episode,
+ 'episode_number': int_or_none(data.get('episodeNumber')),
+ }
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index b6c5ee6e4..f26dafb8f 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor):
bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
- bc_url = BrightcoveNewIE._extract_url(player_code)
+ bc_url = BrightcoveNewIE._extract_url(self, player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
raise ExtractorError('Could not find player definition')
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 0ee56a45b..854b6800c 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
@@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# metadataUrl
'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
- 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
'info_dict': {
'id': '63567059965189-0',
'ext': 'mp4',
@@ -53,7 +54,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# YouTube embed (metadataUrl, provider == USER_YOUTUBE)
'url': 'http://ok.ru/video/64211978996595-1',
- 'md5': '5d7475d428845cd2e13bae6f1a992278',
+ 'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
'info_dict': {
'id': '64211978996595-1',
'ext': 'mp4',
@@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor):
'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
'duration': 440,
'upload_date': '20150826',
- 'uploader_id': '750099571',
- 'uploader': 'Алина П',
+ 'uploader_id': 'tvroscosmos',
+ 'uploader': 'Телестудия Роскосмоса',
'age_limit': 0,
},
}, {
@@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'Video has not been found',
}, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
@@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor):
})
return info
- quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd', 'full'))
+ quality = qualities(('4', '0', '1', '2', '3', '5'))
formats = [{
'url': f['url'],
'ext': 'mp4',
'format_id': f['name'],
- 'quality': quality(f['name']),
} for f in metadata['videos']]
+
+ m3u8_url = metadata.get('hlsManifestUrl')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+
+ dash_manifest = metadata.get('metadataEmbedded')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(dash_manifest), 'mpd'))
+
+ for fmt in formats:
+ fmt_type = self._search_regex(
+ r'\btype[/=](\d)', fmt['url'],
+ 'format type', default=None)
+ if fmt_type:
+ fmt['quality'] = quality(fmt_type)
+
self._sort_formats(formats)
info['formats'] = formats
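The digit carried in the format URL now drives ranking; for instance (hypothetical CDN URL):

    import re
    from youtube_dl.utils import qualities

    quality = qualities(('4', '0', '1', '2', '3', '5'))
    url = 'https://vd123.mycdn.me/video?id=42&type=3&sig=abc'
    fmt_type = re.search(r'\btype[/=](\d)', url).group(1)
    print(quality(fmt_type))  # -> 4, the second-highest of the six tiers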
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 3e51b4dd7..0727e381b 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -8,6 +8,7 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
strip_jsonp,
strip_or_none,
@@ -464,6 +465,7 @@ class PBSIE(InfoExtractor):
redirects.append(redirect)
redirect_urls.add(redirect_url)
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
@@ -479,6 +481,20 @@ class PBSIE(InfoExtractor):
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
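Rerun in isolation, the chapter scraping behaves like this (sample player JS; times are in milliseconds as in the hunk):

    import json
    import re
    from youtube_dl.utils import float_or_none, js_to_json

    player = "chapters.push({start_time: 5000, duration: 60000, title: 'Act 1'})"
    chapters = []
    for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
        chapter = json.loads(js_to_json(chapter_data))
        start_time = float_or_none(chapter.get('start_time'), 1000)
        duration = float_or_none(chapter.get('duration'), 1000)
        if start_time is not None and duration is not None:
            chapters.append({
                'start_time': start_time,
                'end_time': start_time + duration,
                'title': chapter.get('title'),
            })
    print(chapters)  # [{'start_time': 5.0, 'end_time': 65.0, 'title': 'Act 1'}]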
@@ -515,7 +531,7 @@ class PBSIE(InfoExtractor):
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
@@ -588,4 +604,5 @@ class PBSIE(InfoExtractor):
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
index 073fc3e21..24c3600fe 100644
--- a/youtube_dl/extractor/porn91.py
+++ b/youtube_dl/extractor/porn91.py
@@ -1,10 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlencode,
-)
from .common import InfoExtractor
from ..utils import (
parse_duration,
@@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor):
_TEST = {
'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
- 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'md5': '7fcdb5349354f40d41689bd0fa8db05a',
'info_dict': {
'id': '7e42283b4f5ab36da134',
'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
@@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor):
r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
title = title.replace('\n', '')
- # get real url
- file_id = self._search_regex(
- r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
- sec_code = self._search_regex(
- r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
- max_vid = self._search_regex(
- r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
- url_params = compat_urllib_parse_urlencode({
- 'VID': file_id,
- 'mp4': '1',
- 'seccode': sec_code,
- 'max_vid': max_vid,
- })
- info_cn = self._download_webpage(
- 'http://91porn.com/getfile.php?' + url_params, video_id,
- 'Downloading real video url')
- video_url = compat_urllib_parse_unquote(self._search_regex(
- r'file=([^&]+)&', info_cn, 'url'))
+ info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
duration = parse_duration(self._search_regex(
r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
@@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor):
comment_count = int_or_none(self._search_regex(
r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
- return {
+ info_dict.update({
'id': video_id,
'title': title,
- 'url': video_url,
'duration': duration,
'comment_count': comment_count,
'age_limit': self._rta_search(webpage),
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index ed38c77eb..e2202d603 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -62,8 +62,7 @@ class R7IE(InfoExtractor):
# m3u8 format always matches the http format, let's copy metadata from
# one to another
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- formats))
+ lambda f: f.get('vcodec') != 'none', formats))
if len(m3u8_formats) == 1:
f_copy = m3u8_formats[0].copy()
f_copy.update(f)
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
index 9f5c237ef..34725274e 100644
--- a/youtube_dl/extractor/streamable.py
+++ b/youtube_dl/extractor/streamable.py
@@ -12,7 +12,7 @@ from ..utils import (
class StreamableIE(InfoExtractor):
- _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)'
+ _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
_TESTS = [
{
'url': 'https://streamable.com/dnd1',
@@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor):
{
'url': 'https://streamable.com/e/dnd1',
'only_matching': True,
+ },
+ {
+ 'url': 'https://streamable.com/s/okkqk/drxjds',
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644
index 000000000..aa4fad162
--- /dev/null
+++ b/youtube_dl/extractor/streamango.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+ 'md5': 'e992787515a182f55e38fc97588d802a',
+ 'info_dict': {
+ 'id': 'clapasobsptpkdfe',
+ 'ext': 'mp4',
+ 'title': '20170315_150006.mp4',
+ }
+ }, {
+ 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+
+ formats = []
+ for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+ video = self._parse_json(
+ format_, video_id, transform_source=js_to_json, fatal=False)
+ if not video:
+ continue
+ src = video.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, default_ext=None)
+ if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': ext or 'mp4',
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'tbr': int_or_none(video.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': title,
+ 'formats': formats,
+ }
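The new Streamango extractor works because the site inlines its sources as JavaScript object literals (unquoted keys, single quotes), which is why each match is run through js_to_json before _parse_json(). The utility can be exercised on its own; the literal below is illustrative, not captured from the site:

    import json

    from youtube_dl.utils import js_to_json

    src = "{type: 'video/mp4', src: 'https://example.com/v.mp4', height: 720}"
    print(json.loads(js_to_json(src)))
    # {'type': 'video/mp4', 'src': 'https://example.com/v.mp4', 'height': 720}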
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 1b1afab32..3f3c681ae 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -210,7 +210,7 @@ class TEDIE(InfoExtractor):
resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index 06ea2b40a..c5b3288ad 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor):
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
formats.extend(m3u8_formats)
for i, m3u8_format in enumerate(m3u8_formats, 2):
http_url = '%s-%d.mp4' % (video_url_base, i)
diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py
index b6537141a..ebde6053f 100644
--- a/youtube_dl/extractor/tvplayer.py
+++ b/youtube_dl/extractor/tvplayer.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
extract_attributes,
+ try_get,
urlencode_postdata,
ExtractorError,
)
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
webpage, 'channel element'))
title = current_channel['data-name']
- resource_id = self._search_regex(
- r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
- platform = self._search_regex(
- r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+ resource_id = current_channel['data-id']
+
token = self._search_regex(
- r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
- validate = self._search_regex(
- r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+ r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+ 'token', group='token')
+
+ context = self._download_json(
+ 'https://tvplayer.com/watch/context', display_id,
+ 'Downloading JSON context', query={
+ 'resource': resource_id,
+ 'nonce': token,
+ })
+
+ validate = context['validate']
+ platform = try_get(
+ context, lambda x: x['platform']['key'], compat_str) or 'firefox'
try:
response = self._download_json(
'http://api.tvplayer.com/api/v2/stream/live',
- resource_id, headers={
+ display_id, 'Downloading JSON stream', headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}, data=urlencode_postdata({
+ 'id': resource_id,
'service': 1,
'platform': platform,
- 'id': resource_id,
- 'token': token,
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
- formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+ formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
self._sort_formats(formats)
return {
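The reworked TVPlayer flow is a two-step exchange: fetch a signed context (validate token plus platform key) using the page's data-token value as a nonce, then POST that context to the stream API. Condensed from the hunk above into a sketch, with error handling and the explicit form Content-Type header elided:

    context = self._download_json(
        'https://tvplayer.com/watch/context', display_id,
        'Downloading JSON context', query={
            'resource': resource_id,   # from the channel element's data-id
            'nonce': token,            # from the data-token attribute
        })
    validate = context['validate']
    platform = try_get(
        context, lambda x: x['platform']['key'], compat_str) or 'firefox'
    response = self._download_json(
        'http://api.tvplayer.com/api/v2/stream/live', display_id,
        'Downloading JSON stream', data=urlencode_postdata({
            'id': resource_id,
            'service': 1,
            'platform': platform,
            'validate': validate,
        }))['tvplayer']['response']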
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 9aa38bc5a..890a149ea 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
from ..compat import (
@@ -11,7 +12,6 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
- sanitized_Request,
parse_iso8601,
)
@@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):
}
def _initialize_api(self, video_id):
- req = sanitized_Request(
- 'http://www.vevo.com/auth', data=b'')
webpage = self._download_webpage(
- req, None,
+ 'https://accounts.vevo.com/token', None,
note='Retrieving oauth token',
- errnote='Unable to retrieve oauth token')
+ errnote='Unable to retrieve oauth token',
+ data=json.dumps({
+ 'client_id': 'SPupX1tvqFEopQ1YS6SS',
+ 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
self.raise_geo_restricted(
'%s said: This page is currently unavailable in your region' % self.IE_NAME)
auth_info = self._parse_json(webpage, video_id)
- self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
+ self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
def _call_api(self, path, *args, **kwargs):
try:
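Passing data= to _download_webpage turns the request into a POST; because the body here is JSON rather than form data, the Content-Type header has to be set explicitly. The same anonymous-token call, written standalone (the client_id is the one from the hunk and may rotate; treat the whole endpoint as an assumption about Vevo's API of the time):

    import json

    from youtube_dl.compat import compat_urllib_request

    req = compat_urllib_request.Request(
        'https://accounts.vevo.com/token',
        data=json.dumps({
            'client_id': 'SPupX1tvqFEopQ1YS6SS',
            'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
        }).encode('utf-8'),
        headers={'Content-Type': 'application/json'})
    auth_info = json.loads(
        compat_urllib_request.urlopen(req).read().decode('utf-8'))
    api_token = auth_info['legacy_token']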
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
index 049db25a5..e5f964d39 100644
--- a/youtube_dl/extractor/videopress.py
+++ b/youtube_dl/extractor/videopress.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import random
import re
from .common import InfoExtractor
@@ -11,6 +10,7 @@ from ..utils import (
float_or_none,
parse_age_limit,
qualities,
+ random_birthday,
try_get,
unified_timestamp,
urljoin,
@@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ query = random_birthday('birth_year', 'birth_month', 'birth_day')
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
- video_id, query={
- 'birth_month': random.randint(1, 12),
- 'birth_day': random.randint(1, 31),
- 'birth_year': random.randint(1950, 1995),
- })
+ video_id, query=query)
title = video['title']
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index 4e4b4e38c..701bb1d01 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -49,8 +49,11 @@ class VidioIE(InfoExtractor):
thumbnail = clip.get('image')
m3u8_url = m3u8_url or self._search_regex(
- r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url')
- formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1',
+ webpage, 'hls url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex(
r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, 'duration'))
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index d0556297e..e64873bce 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -42,14 +42,15 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- packed_codes = [mobj.group(0) for mobj in re.finditer(
- PACKED_CODES_RE, webpage)]
- for num, pc in enumerate(packed_codes, 1):
- code = decode_packed_codes(pc).replace('\\\'', '\'')
+ codes = [webpage]
+ codes.extend([
+ decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
+ for mobj in re.finditer(PACKED_CODES_RE, webpage)])
+ for num, code in enumerate(codes, 1):
jwplayer_data = self._parse_json(
self._search_regex(
r'setup\(([^)]+)\)', code, 'jwplayer data',
- default=NO_DEFAULT if num == len(packed_codes) else '{}'),
+ default=NO_DEFAULT if num == len(codes) else '{}'),
video_id, transform_source=js_to_json)
if jwplayer_data:
break
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fcf0cb100..d5d5b4c69 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 839cad986..625d0a1cc 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,7 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_TEST = {
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
},
}
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return re.findall(
+ r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
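_extract_urls() is the hook the generic extractor uses to discover embeds: extractors that define it get the raw page and return every embed URL they recognize. A self-contained check of the regex side (class stripped down to just the hook, the UUID character class simplified):

    import re

    class WashingtonPostIE(object):  # stand-in for the real InfoExtractor subclass
        _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f-]{36}'

        @classmethod
        def _extract_urls(cls, webpage):
            return re.findall(
                r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)

    page = ('<iframe width="480" src="https://www.washingtonpost.com/video/c/'
            'embed/480ba4ee-1ec7-11e6-82c2-a7dcb313287d"></iframe>')
    print(WashingtonPostIE._extract_urls(page))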
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483ae..45cfca7c5 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,12 +10,14 @@ from ..utils import (
class WSJIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:
- video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
- (?:www\.)?wsj\.com/video/[^/]+/
- )
- (?P<id>[a-zA-Z0-9-]+)'''
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+ https?://(?:www\.)?wsj\.com/video/[^/]+/|
+ wsj:
+ )
+ (?P<id>[a-fA-F0-9-]{36})
+ '''
IE_DESC = 'Wall Street Journal'
_TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = (
- 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
- 'thumbnailList,author,description,name,duration,videoURL,'
- 'titletag,formattedCreationDate,keywords,editor' % video_id)
- info = self._download_json(api_url, video_id)['items'][0]
+ info = self._download_json(
+ 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+ query={
+ 'type': 'guid',
+ 'count': 1,
+ 'query': video_id,
+ 'fields': ','.join((
+ 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+ 'description', 'name', 'duration', 'videoURL', 'titletag',
+ 'formattedCreationDate', 'keywords', 'editor')),
+ })['items'][0]
title = info.get('name', info.get('titletag'))
formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
'title': title,
'categories': info.get('keywords'),
}
+
+
+class WSJArticleIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+ 'info_dict': {
+ 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+ 'ext': 'mp4',
+ 'upload_date': '20170221',
+ 'uploader_id': 'ralcaraz',
+ 'title': 'Bao Bao the Panda Leaves for China',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ video_id = self._search_regex(
+ r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+ return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index 1b5cd122a..13f8be6cb 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -17,24 +17,24 @@ from ..utils import (
class XFileShareIE(InfoExtractor):
_SITES = (
- ('daclips.in', 'DaClips'),
- ('filehoot.com', 'FileHoot'),
- ('gorillavid.in', 'GorillaVid'),
- ('movpod.in', 'MovPod'),
- ('powerwatch.pw', 'PowerWatch'),
- ('rapidvideo.ws', 'Rapidvideo.ws'),
- ('thevideobee.to', 'TheVideoBee'),
- ('vidto.me', 'Vidto'),
- ('streamin.to', 'Streamin.To'),
- ('xvidstage.com', 'XVIDSTAGE'),
- ('vidabc.com', 'Vid ABC'),
- ('vidbom.com', 'VidBom'),
- ('vidlo.us', 'vidlo'),
+ (r'daclips\.(?:in|com)', 'DaClips'),
+ (r'filehoot\.com', 'FileHoot'),
+ (r'gorillavid\.(?:in|com)', 'GorillaVid'),
+ (r'movpod\.in', 'MovPod'),
+ (r'powerwatch\.pw', 'PowerWatch'),
+ (r'rapidvideo\.ws', 'Rapidvideo.ws'),
+ (r'thevideobee\.to', 'TheVideoBee'),
+ (r'vidto\.me', 'Vidto'),
+ (r'streamin\.to', 'Streamin.To'),
+ (r'xvidstage\.com', 'XVIDSTAGE'),
+ (r'vidabc\.com', 'Vid ABC'),
+ (r'vidbom\.com', 'VidBom'),
+ (r'vidlo\.us', 'vidlo'),
)
IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
- % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
+ % '|'.join(site for site in list(zip(*_SITES))[0]))
_FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<',
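Since _SITES now carries regex fragments (e.g. daclips\.(?:in|com)) instead of literal hostnames, the hosts are interpolated into _VALID_URL verbatim rather than through re.escape(). The construction is easy to sanity-check in isolation (the URLs below are made up to fit the patterns):

    import re

    _SITES = (
        (r'daclips\.(?:in|com)', 'DaClips'),
        (r'gorillavid\.(?:in|com)', 'GorillaVid'),
    )
    _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
                  % '|'.join(site for site in list(zip(*_SITES))[0]))

    for url in ('http://daclips.com/abc123', 'http://gorillavid.in/embed-xyz789'):
        m = re.match(_VALID_URL, url)
        print(m.group('host'), m.group('id'))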
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 5584674a0..bea9b87ad 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ js_to_json,
orderedSet,
parse_duration,
sanitized_Request,
@@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
+ # FLV videos with duplicated formats
+ 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+ 'md5': 'a406963eb349dd43692ec54631efd88b',
+ 'info_dict': {
+ 'id': '9299752',
+ 'display_id': 'A-Super-Run-Part-1-YT',
+ 'ext': 'flv',
+ 'title': 'A Super Run - Part 1 (YT)',
+ 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+ 'uploader': 'tshirtguy59',
+ 'duration': 579,
+ 'view_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ },
+ }, {
# new URL scheme
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
})
sources = self._parse_json(self._search_regex(
- r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
- webpage, 'sources', group='sources'), video_id)
+ r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+ webpage, 'sources', group='sources'), video_id,
+ transform_source=js_to_json)
formats = []
for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
'format_id': format_id,
'height': int_or_none(format_id),
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
title = self._search_regex(
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 30825daae..eca603028 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
- ExtractorError,
determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_duration,
)
@@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):
'id': '4588838',
'ext': 'mp4',
'title': 'Biker Takes his Girl',
+ 'duration': 108,
'age_limit': 18,
}
}
@@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ video_duration = int_or_none(self._og_search_property(
+ 'duration', webpage, default=None)) or parse_duration(
+ self._search_regex(
+ r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+ webpage, 'duration', fatal=False))
formats = []
@@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):
'id': video_id,
'formats': formats,
'title': video_title,
+ 'duration': video_duration,
'thumbnail': video_thumbnail,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 4951414e9..38f82bf44 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -258,7 +258,7 @@ class YahooIE(InfoExtractor):
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
# Look for Brightcove New Studio embeds
- bc_url = BrightcoveNewIE._extract_url(webpage)
+ bc_url = BrightcoveNewIE._extract_url(self, webpage)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index fd6268ba4..eb1062142 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'overembed': 'false',
})['playlist']
- tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+ tracks = playlist['tracks']
+ track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
# The tracks dictionary shipped with the playlist.jsx API is limited to 150
# tracks; missing tracks have to be retrieved manually.
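The track_ids change above is more than style: on Python 3, map() returns a lazy single-pass iterator, so any second iteration over it (or a len()) silently comes up empty. The list comprehension materializes the ids once:

    # Python 3 behaviour that motivated the change
    lazy = map(str, [1, 2, 3])
    print(list(lazy))   # ['1', '2', '3']
    print(list(lazy))   # [] -- the iterator is already exhausted

    eager = [str(track_id) for track_id in [1, 2, 3]]
    print(eager)        # ['1', '2', '3']
    print(eager)        # ['1', '2', '3'] -- reusable any number of times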
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9e2b9115c..480f403da 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -963,7 +963,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1629,7 +1629,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644
index 000000000..889aff5d8
--- /dev/null
+++ b/youtube_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://zaq1.pl/video/xev0e',
+ 'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+ 'info_dict': {
+ 'id': 'xev0e',
+ 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+ 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+ 'ext': 'mp4',
+ 'duration': 511,
+ 'timestamp': 1490896361,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170330',
+ 'view_count': int,
+ }
+ }, {
+ # malformed JSON-LD
+ 'url': 'http://zaq1.pl/video/x81vn',
+ 'info_dict': {
+ 'id': 'x81vn',
+ 'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+ 'ext': 'mp4',
+ 'duration': 6234,
+ 'timestamp': 1493494860,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170429',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'video url', group='url')
+
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+
+ def extract_data(field, name, fatal=False):
+ return self._search_regex(
+ r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+ webpage, name, fatal=fatal, group='field')
+
+ if not info.get('title'):
+ info['title'] = extract_data('file-name', 'title', fatal=True)
+
+ if not info.get('duration'):
+ info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+ if not info.get('thumbnail'):
+ info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+ if not info.get('timestamp'):
+ info['timestamp'] = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp'))
+
+ if not info.get('view_count'):
+ info['view_count'] = int_or_none(self._html_search_meta(
+ 'interactionCount', webpage, 'view count'))
+
+ uploader = self._html_search_regex(
+ r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+ fatal=False)
+
+ width = int_or_none(self._html_search_meta(
+ 'width', webpage, fatal=False))
+ height = int_or_none(self._html_search_meta(
+ 'height', webpage, fatal=False))
+
+ info.update({
+ 'id': video_id,
+ 'formats': [{
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'http_headers': {
+ 'Referer': url,
+ },
+ }],
+ 'uploader': uploader,
+ })
+
+ return info
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 2d2f5e47b..52309fb84 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -469,6 +469,10 @@ def parseOpts(overrideArguments=None):
action='store_false', dest='skip_unavailable_fragments',
help='Abort downloading when some fragment is not available')
downloader.add_option(
+ '--keep-fragments',
+ action='store_true', dest='keep_fragments', default=False,
+ help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default')
+ downloader.add_option(
'--buffer-size',
dest='buffersize', metavar='SIZE', default='1024',
help='Size of download buffer (e.g. 1024 or 16K) (default is %default)')
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 665109558..c91ec8588 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -4,6 +4,7 @@ import io
import os
import subprocess
import time
+import re
from .common import AudioConversionError, PostProcessor
@@ -22,6 +23,7 @@ from ..utils import (
subtitles_filename,
dfxp2srt,
ISO639Utils,
+ replace_extension,
)
@@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
+ in_filenames = [filename]
+ options = []
if info['ext'] == 'm4a':
- options = ['-vn', '-acodec', 'copy']
+ options.extend(['-vn', '-acodec', 'copy'])
else:
- options = ['-c', 'copy']
+ options.extend(['-c', 'copy'])
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
+ chapters = info.get('chapters', [])
+ if chapters:
+ metadata_filename = encodeFilename(replace_extension(filename, 'meta'))
+ with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+ def ffmpeg_escape(text):
+ return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
+
+ metadata_file_content = ';FFMETADATA1\n'
+ for chapter in chapters:
+ metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+ metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+ metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+ chapter_title = chapter.get('title')
+ if chapter_title:
+ metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+ f.write(metadata_file_content)
+ in_filenames.append(metadata_filename)
+ options.extend(['-map_metadata', '1'])
+
self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
- self.run_ffmpeg(filename, temp_filename, options)
+ self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
+ if chapters:
+ os.remove(metadata_filename)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return [], info
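The new chapter branch serializes info['chapters'] into ffmpeg's FFMETADATA1 format, feeds that file as a second input, and imports it with -map_metadata 1. What the generated sidecar looks like can be reproduced standalone; the escaping rules match ffmpeg_escape above:

    import re

    def ffmpeg_escape(text):
        # '=', ';', '#', '\' and newline are special in ffmpeg metadata files
        return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)

    chapters = [
        {'start_time': 0, 'end_time': 60.5, 'title': 'Intro'},
        {'start_time': 60.5, 'end_time': 300, 'title': 'Part #1; the = sign'},
    ]

    content = ';FFMETADATA1\n'
    for chapter in chapters:
        content += '[CHAPTER]\nTIMEBASE=1/1000\n'
        content += 'START=%d\n' % (chapter['start_time'] * 1000)
        content += 'END=%d\n' % (chapter['end_time'] * 1000)
        if chapter.get('title'):
            content += 'title=%s\n' % ffmpeg_escape(chapter['title'])
    print(content)

The resulting invocation is then roughly `ffmpeg -i input.mp4 -i input.meta -map_metadata 1 -c copy output.mp4`.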
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
index 0f5d7bdb2..5d4adbe72 100644
--- a/youtube_dl/socks.py
+++ b/youtube_dl/socks.py
@@ -193,9 +193,10 @@ class sockssocket(socket.socket):
self._check_response_version(SOCKS5_VERSION, version)
- if method == Socks5Auth.AUTH_NO_ACCEPTABLE:
+ if method == Socks5Auth.AUTH_NO_ACCEPTABLE or (
+ method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)):
self.close()
- raise Socks5Error(method)
+ raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE)
if method == Socks5Auth.AUTH_USER_PASS:
username = self._proxy.username.encode('utf-8')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 84aaac664..c67f95ac9 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -11,6 +11,7 @@ import contextlib
import ctypes
import datetime
import email.utils
+import email.header
import errno
import functools
import gzip
@@ -421,8 +422,8 @@ def clean_html(html):
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
- html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+ html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True):
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+ # Remove unrecognized timezones from ISO 8601-like timestamps
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ if m:
+ date_str = date_str[:-len(m.group('tz'))]
+
for expression in date_formats(day_first):
try:
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
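The added pass strips a trailing alphabetic timezone abbreviation that none of the date_formats() expressions would otherwise accept, so ISO 8601-shaped strings with tokens like CEST now parse instead of failing. Note the abbreviation is discarded, not converted to an offset, so the remainder is interpreted as UTC:

    from youtube_dl.utils import unified_timestamp

    # 'CEST' is unrecognized; it is removed and the rest parsed as UTC
    print(unified_timestamp('2017-05-04T11:00:06 CEST'))  # 1493895606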
@@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
return new_req
+def try_multipart_encode(data, boundary):
+ content_type = 'multipart/form-data; boundary=%s' % boundary
+
+ out = b''
+ for k, v in data.items():
+ out += b'--' + boundary.encode('ascii') + b'\r\n'
+ if isinstance(k, compat_str):
+ k = k.encode('utf-8')
+ if isinstance(v, compat_str):
+ v = v.encode('utf-8')
+ # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+ # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+ content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+ if boundary.encode('ascii') in content:
+ raise ValueError('Boundary overlaps with data')
+ out += content
+
+ out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+ return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+ '''
+ Encode a dict to RFC 7578-compliant form-data
+
+ data:
+ A dict where keys and values can be either Unicode or bytes-like
+ objects.
+ boundary:
+ If specified, it must be a Unicode object and is used as the boundary.
+ Otherwise a random boundary is generated.
+
+ Reference: https://tools.ietf.org/html/rfc7578
+ '''
+ has_specified_boundary = boundary is not None
+
+ while True:
+ if boundary is None:
+ boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+ try:
+ out, content_type = try_multipart_encode(data, boundary)
+ break
+ except ValueError:
+ if has_specified_boundary:
+ raise
+ boundary = None
+
+ return out, content_type
+
+
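multipart_encode() hands back the encoded body together with the matching Content-Type value (the two must agree on the boundary). Typical use; the fixed boundary here is only for reproducible output and is normally omitted:

    from youtube_dl.utils import multipart_encode

    body, content_type = multipart_encode(
        {'username': 'user', 'file': b'\x00\x01'},
        boundary='----------------0123456789')
    print(content_type)
    # multipart/form-data; boundary=----------------0123456789
    print(body)

The body then goes out as POST data with content_type set as the Content-Type header; if the boundary collides with the payload, a ValueError is raised (or a fresh random boundary is picked when none was specified).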
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
@@ -2103,13 +2161,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
def try_get(src, getter, expected_type=None):
- try:
- v = getter(src)
- except (AttributeError, KeyError, TypeError, IndexError):
- pass
- else:
- if expected_type is None or isinstance(v, expected_type):
- return v
+ if not isinstance(getter, (list, tuple)):
+ getter = [getter]
+ for get in getter:
+ try:
+ v = get(src)
+ except (AttributeError, KeyError, TypeError, IndexError):
+ pass
+ else:
+ if expected_type is None or isinstance(v, expected_type):
+ return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
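try_get() now also accepts a list or tuple of getters and returns the result of the first one that neither raises nor fails the expected_type check, which is convenient when a site ships the same field in several layouts:

    from youtube_dl.compat import compat_str
    from youtube_dl.utils import try_get

    meta = {'video': {'title': 'Example clip'}}

    title = try_get(meta, [
        lambda x: x['clip']['name'],    # old layout: raises KeyError, skipped
        lambda x: x['video']['title'],  # current layout
    ], compat_str)
    print(title)  # Example clip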
@@ -2270,10 +2331,8 @@ def mimetype2ext(mt):
return {
'3gpp': '3gp',
'smptett+xml': 'tt',
- 'srt': 'srt',
'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
- 'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
@@ -2281,11 +2340,11 @@ def mimetype2ext(mt):
'x-mpegurl': 'm3u8',
'vnd.apple.mpegurl': 'm3u8',
'dash+xml': 'mpd',
- 'f4m': 'f4m',
'f4m+xml': 'f4m',
'hds+xml': 'f4m',
'vnd.ms-sstr+xml': 'ism',
'quicktime': 'mov',
+ 'mp2t': 'ts',
}.get(res, res)
@@ -2508,27 +2567,97 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
+ LEGACY_NAMESPACES = (
+ ('http://www.w3.org/ns/ttml', [
+ 'http://www.w3.org/2004/11/ttaf1',
+ 'http://www.w3.org/2006/04/ttaf1',
+ 'http://www.w3.org/2006/10/ttaf1',
+ ]),
+ ('http://www.w3.org/ns/ttml#styling', [
+ 'http://www.w3.org/ns/ttml#style',
+ ]),
+ )
+
+ SUPPORTED_STYLING = [
+ 'color',
+ 'fontFamily',
+ 'fontSize',
+ 'fontStyle',
+ 'fontWeight',
+ 'textDecoration'
+ ]
+
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
- 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
- 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
+ 'tts': 'http://www.w3.org/ns/ttml#styling',
})
+ styles = {}
+ default_style = {}
+
class TTMLPElementParser(object):
- out = ''
+ _out = ''
+ _unclosed_elements = []
+ _applied_styles = []
def start(self, tag, attrib):
- if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- self.out += '\n'
+ if tag in (_x('ttml:br'), 'br'):
+ self._out += '\n'
+ else:
+ unclosed_elements = []
+ style = {}
+ element_style_id = attrib.get('style')
+ if default_style:
+ style.update(default_style)
+ if element_style_id:
+ style.update(styles.get(element_style_id, {}))
+ for prop in SUPPORTED_STYLING:
+ prop_val = attrib.get(_x('tts:' + prop))
+ if prop_val:
+ style[prop] = prop_val
+ if style:
+ font = ''
+ for k, v in sorted(style.items()):
+ if self._applied_styles and self._applied_styles[-1].get(k) == v:
+ continue
+ if k == 'color':
+ font += ' color="%s"' % v
+ elif k == 'fontSize':
+ font += ' size="%s"' % v
+ elif k == 'fontFamily':
+ font += ' face="%s"' % v
+ elif k == 'fontWeight' and v == 'bold':
+ self._out += '<b>'
+ unclosed_elements.append('b')
+ elif k == 'fontStyle' and v == 'italic':
+ self._out += '<i>'
+ unclosed_elements.append('i')
+ elif k == 'textDecoration' and v == 'underline':
+ self._out += '<u>'
+ unclosed_elements.append('u')
+ if font:
+ self._out += '<font' + font + '>'
+ unclosed_elements.append('font')
+ applied_style = {}
+ if self._applied_styles:
+ applied_style.update(self._applied_styles[-1])
+ applied_style.update(style)
+ self._applied_styles.append(applied_style)
+ self._unclosed_elements.append(unclosed_elements)
def end(self, tag):
- pass
+ if tag not in (_x('ttml:br'), 'br'):
+ unclosed_elements = self._unclosed_elements.pop()
+ for element in reversed(unclosed_elements):
+ self._out += '</%s>' % element
+ if unclosed_elements and self._applied_styles:
+ self._applied_styles.pop()
def data(self, data):
- self.out += data
+ self._out += data
def close(self):
- return self.out.strip()
+ return self._out.strip()
def parse_node(node):
target = TTMLPElementParser()
@@ -2536,13 +2665,45 @@ def dfxp2srt(dfxp_data):
parser.feed(xml.etree.ElementTree.tostring(node))
return parser.close()
+ for k, v in LEGACY_NAMESPACES:
+ for ns in v:
+ dfxp_data = dfxp_data.replace(ns, k)
+
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
+ repeat = False
+ while True:
+ for style in dfxp.findall(_x('.//ttml:style')):
+ style_id = style.get('id')
+ parent_style_id = style.get('style')
+ if parent_style_id:
+ if parent_style_id not in styles:
+ repeat = True
+ continue
+ styles[style_id] = styles[parent_style_id].copy()
+ for prop in SUPPORTED_STYLING:
+ prop_val = style.get(_x('tts:' + prop))
+ if prop_val:
+ styles.setdefault(style_id, {})[prop] = prop_val
+ if repeat:
+ repeat = False
+ else:
+ break
+
+ for p in ('body', 'div'):
+ ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
+ if ele is None:
+ continue
+ style = styles.get(ele.get('style'))
+ if not style:
+ continue
+ default_style.update(style)
+
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
end_time = parse_dfxp_time_expr(para.attrib.get('end'))
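With the styling support above, dfxp2srt() resolves tts:* attributes (plus <style> inheritance and body/div defaults) into the small tag set SRT renderers understand: <font color/size/face>, <b>, <i> and <u>. A round trip on a minimal TTML document; the output shape follows the mapping in start() above:

    from youtube_dl.utils import dfxp2srt

    dfxp = '''<?xml version="1.0" encoding="utf-8"?>
    <tt xmlns="http://www.w3.org/ns/ttml"
        xmlns:tts="http://www.w3.org/ns/ttml#styling">
      <body><div>
        <p begin="0" end="1" tts:color="white" tts:fontWeight="bold">Hello</p>
      </div></body>
    </tt>'''
    print(dfxp2srt(dfxp))
    # 1
    # 00:00:00,000 --> 00:00:01,000
    # <b><font color="white">Hello</font></b>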
@@ -3862,3 +4023,10 @@ class PhantomJSwrapper(object):
return (html, encodeArgument(out))
+
+def random_birthday(year_field, month_field, day_field):
+ return {
+ year_field: str(random.randint(1950, 1995)),
+ month_field: str(random.randint(1, 12)),
+ day_field: str(random.randint(1, 31)),
+ }
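random_birthday() backs age-gate query parameters such as the videopress change above: the caller picks the field names, the helper supplies plausible string values. It does not validate the calendar, so a February 31st can occur; the servers doing these checks rarely care:

    from youtube_dl.utils import random_birthday

    query = random_birthday('birth_year', 'birth_month', 'birth_day')
    print(query)  # e.g. {'birth_year': '1972', 'birth_month': '4', 'birth_day': '27'}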
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 612b50f7b..c19ac49b0 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2017.04.15'
+__version__ = '2017.05.01'