aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py478
-rw-r--r--youtube_dl/__init__.py17
-rw-r--r--youtube_dl/aes.py2
-rw-r--r--youtube_dl/compat.py115
-rw-r--r--youtube_dl/downloader/__init__.py2
-rw-r--r--youtube_dl/downloader/dash.py66
-rw-r--r--youtube_dl/downloader/external.py40
-rw-r--r--youtube_dl/downloader/f4m.py98
-rw-r--r--youtube_dl/downloader/fragment.py111
-rw-r--r--youtube_dl/downloader/hls.py81
-rw-r--r--youtube_dl/downloader/http.py19
-rw-r--r--youtube_dl/downloader/rtmp.py2
-rw-r--r--youtube_dl/extractor/__init__.py187
-rw-r--r--youtube_dl/extractor/abc.py51
-rw-r--r--youtube_dl/extractor/adobetv.py60
-rw-r--r--youtube_dl/extractor/aftenposten.py77
-rw-r--r--youtube_dl/extractor/aftonbladet.py11
-rw-r--r--youtube_dl/extractor/appleconnect.py50
-rw-r--r--youtube_dl/extractor/archiveorg.py2
-rw-r--r--youtube_dl/extractor/ard.py203
-rw-r--r--youtube_dl/extractor/arte.py5
-rw-r--r--youtube_dl/extractor/baidu.py1
-rw-r--r--youtube_dl/extractor/bbc.py780
-rw-r--r--youtube_dl/extractor/bbccouk.py380
-rw-r--r--youtube_dl/extractor/bet.py17
-rw-r--r--youtube_dl/extractor/bild.py9
-rw-r--r--youtube_dl/extractor/bilibili.py33
-rw-r--r--youtube_dl/extractor/bliptv.py66
-rw-r--r--youtube_dl/extractor/br.py39
-rw-r--r--youtube_dl/extractor/breakcom.py1
-rw-r--r--youtube_dl/extractor/brightcove.py34
-rw-r--r--youtube_dl/extractor/byutv.py2
-rw-r--r--youtube_dl/extractor/canalplus.py26
-rw-r--r--youtube_dl/extractor/cbs.py23
-rw-r--r--youtube_dl/extractor/cbsnews.py2
-rw-r--r--youtube_dl/extractor/ccc.py4
-rw-r--r--youtube_dl/extractor/ceskatelevize.py3
-rw-r--r--youtube_dl/extractor/chilloutzone.py2
-rw-r--r--youtube_dl/extractor/cinemassacre.py18
-rw-r--r--youtube_dl/extractor/clipfish.py67
-rw-r--r--youtube_dl/extractor/clipsyndicate.py14
-rw-r--r--youtube_dl/extractor/cnet.py19
-rw-r--r--youtube_dl/extractor/cnn.py2
-rw-r--r--youtube_dl/extractor/comcarcoff.py2
-rw-r--r--youtube_dl/extractor/common.py400
-rw-r--r--youtube_dl/extractor/crunchyroll.py87
-rw-r--r--youtube_dl/extractor/ctsnews.py1
-rw-r--r--youtube_dl/extractor/dailymotion.py242
-rw-r--r--youtube_dl/extractor/dcn.py84
-rw-r--r--youtube_dl/extractor/dfb.py27
-rw-r--r--youtube_dl/extractor/dhm.py30
-rw-r--r--youtube_dl/extractor/discovery.py52
-rw-r--r--youtube_dl/extractor/douyutv.py1
-rw-r--r--youtube_dl/extractor/dramafever.py216
-rw-r--r--youtube_dl/extractor/drbonanza.py12
-rw-r--r--youtube_dl/extractor/drtuber.py21
-rw-r--r--youtube_dl/extractor/drtv.py37
-rw-r--r--youtube_dl/extractor/dumpert.py2
-rw-r--r--youtube_dl/extractor/ehow.py6
-rw-r--r--youtube_dl/extractor/empflix.py25
-rw-r--r--youtube_dl/extractor/eroprofile.py12
-rw-r--r--youtube_dl/extractor/escapist.py52
-rw-r--r--youtube_dl/extractor/espn.py55
-rw-r--r--youtube_dl/extractor/esri.py74
-rw-r--r--youtube_dl/extractor/facebook.py21
-rw-r--r--youtube_dl/extractor/faz.py21
-rw-r--r--youtube_dl/extractor/fc2.py2
-rw-r--r--youtube_dl/extractor/firedrive.py80
-rw-r--r--youtube_dl/extractor/fivetv.py88
-rw-r--r--youtube_dl/extractor/folketinget.py4
-rw-r--r--youtube_dl/extractor/fourtube.py1
-rw-r--r--youtube_dl/extractor/foxnews.py15
-rw-r--r--youtube_dl/extractor/foxsports.py2
-rw-r--r--youtube_dl/extractor/francetv.py91
-rw-r--r--youtube_dl/extractor/funnyordie.py2
-rw-r--r--youtube_dl/extractor/gamespot.py66
-rw-r--r--youtube_dl/extractor/gdcvault.py33
-rw-r--r--youtube_dl/extractor/generic.py603
-rw-r--r--youtube_dl/extractor/gfycat.py28
-rw-r--r--youtube_dl/extractor/giga.py3
-rw-r--r--youtube_dl/extractor/gorillavid.py15
-rw-r--r--youtube_dl/extractor/hentaistigma.py11
-rw-r--r--youtube_dl/extractor/historicfilms.py3
-rw-r--r--youtube_dl/extractor/hostingbulk.py6
-rw-r--r--youtube_dl/extractor/howcast.py35
-rw-r--r--youtube_dl/extractor/howstuffworks.py6
-rw-r--r--youtube_dl/extractor/imdb.py2
-rw-r--r--youtube_dl/extractor/imgur.py31
-rw-r--r--youtube_dl/extractor/ina.py2
-rw-r--r--youtube_dl/extractor/indavideo.py142
-rw-r--r--youtube_dl/extractor/infoq.py18
-rw-r--r--youtube_dl/extractor/instagram.py19
-rw-r--r--youtube_dl/extractor/iprima.py16
-rw-r--r--youtube_dl/extractor/iqiyi.py273
-rw-r--r--youtube_dl/extractor/ir90tv.py42
-rw-r--r--youtube_dl/extractor/izlesene.py18
-rw-r--r--youtube_dl/extractor/jeuxvideo.py9
-rw-r--r--youtube_dl/extractor/kaltura.py34
-rw-r--r--youtube_dl/extractor/kanalplay.py5
-rw-r--r--youtube_dl/extractor/karaoketv.py4
-rw-r--r--youtube_dl/extractor/karrierevideos.py96
-rw-r--r--youtube_dl/extractor/kickstarter.py15
-rw-r--r--youtube_dl/extractor/kontrtube.py40
-rw-r--r--youtube_dl/extractor/krasview.py3
-rw-r--r--youtube_dl/extractor/kuwo.py314
-rw-r--r--youtube_dl/extractor/lecture2go.py62
-rw-r--r--youtube_dl/extractor/letv.py8
-rw-r--r--youtube_dl/extractor/libsyn.py30
-rw-r--r--youtube_dl/extractor/lifenews.py130
-rw-r--r--youtube_dl/extractor/liveleak.py16
-rw-r--r--youtube_dl/extractor/livestream.py22
-rw-r--r--youtube_dl/extractor/lynda.py27
-rw-r--r--youtube_dl/extractor/mailru.py2
-rw-r--r--youtube_dl/extractor/malemotion.py6
-rw-r--r--youtube_dl/extractor/mdr.py2
-rw-r--r--youtube_dl/extractor/metacafe.py3
-rw-r--r--youtube_dl/extractor/mitele.py12
-rw-r--r--youtube_dl/extractor/mixcloud.py6
-rw-r--r--youtube_dl/extractor/mlb.py43
-rw-r--r--youtube_dl/extractor/mofosex.py4
-rw-r--r--youtube_dl/extractor/moniker.py18
-rw-r--r--youtube_dl/extractor/mtv.py74
-rw-r--r--youtube_dl/extractor/mwave.py58
-rw-r--r--youtube_dl/extractor/myspass.py3
-rw-r--r--youtube_dl/extractor/myvi.py60
-rw-r--r--youtube_dl/extractor/myvideo.py11
-rw-r--r--youtube_dl/extractor/nationalgeographic.py44
-rw-r--r--youtube_dl/extractor/naver.py24
-rw-r--r--youtube_dl/extractor/nba.py20
-rw-r--r--youtube_dl/extractor/nbc.py57
-rw-r--r--youtube_dl/extractor/ndr.py102
-rw-r--r--youtube_dl/extractor/neteasemusic.py459
-rw-r--r--youtube_dl/extractor/netzkino.py2
-rw-r--r--youtube_dl/extractor/newstube.py2
-rw-r--r--youtube_dl/extractor/nextmedia.py43
-rw-r--r--youtube_dl/extractor/nfl.py6
-rw-r--r--youtube_dl/extractor/nhl.py39
-rw-r--r--youtube_dl/extractor/niconico.py3
-rw-r--r--youtube_dl/extractor/noco.py142
-rw-r--r--youtube_dl/extractor/nova.py179
-rw-r--r--youtube_dl/extractor/nowtv.py193
-rw-r--r--youtube_dl/extractor/npo.py163
-rw-r--r--youtube_dl/extractor/nrk.py37
-rw-r--r--youtube_dl/extractor/nytimes.py82
-rw-r--r--youtube_dl/extractor/odnoklassniki.py43
-rw-r--r--youtube_dl/extractor/onionstudios.py76
-rw-r--r--youtube_dl/extractor/ooyala.py183
-rw-r--r--youtube_dl/extractor/openfilm.py4
-rw-r--r--youtube_dl/extractor/patreon.py2
-rw-r--r--youtube_dl/extractor/pbs.py76
-rw-r--r--youtube_dl/extractor/periscope.py99
-rw-r--r--youtube_dl/extractor/photobucket.py4
-rw-r--r--youtube_dl/extractor/pinkbike.py96
-rw-r--r--youtube_dl/extractor/planetaplay.py3
-rw-r--r--youtube_dl/extractor/played.py4
-rw-r--r--youtube_dl/extractor/playtvak.py181
-rw-r--r--youtube_dl/extractor/playvid.py7
-rw-r--r--youtube_dl/extractor/pluralsight.py207
-rw-r--r--youtube_dl/extractor/porn91.py73
-rw-r--r--youtube_dl/extractor/pornhub.py31
-rw-r--r--youtube_dl/extractor/pornovoisines.py4
-rw-r--r--youtube_dl/extractor/primesharetv.py9
-rw-r--r--youtube_dl/extractor/promptfile.py5
-rw-r--r--youtube_dl/extractor/prosiebensat1.py43
-rw-r--r--youtube_dl/extractor/qqmusic.py185
-rw-r--r--youtube_dl/extractor/quickvid.py1
-rw-r--r--youtube_dl/extractor/rds.py73
-rw-r--r--youtube_dl/extractor/rtbf.py50
-rw-r--r--youtube_dl/extractor/rtl2.py27
-rw-r--r--youtube_dl/extractor/rtlnl.py58
-rw-r--r--youtube_dl/extractor/rtlnow.py174
-rw-r--r--youtube_dl/extractor/rtp.py4
-rw-r--r--youtube_dl/extractor/rts.py32
-rw-r--r--youtube_dl/extractor/rtve.py2
-rw-r--r--youtube_dl/extractor/rtvnh.py47
-rw-r--r--youtube_dl/extractor/rutube.py1
-rw-r--r--youtube_dl/extractor/rutv.py19
-rw-r--r--youtube_dl/extractor/ruutu.py119
-rw-r--r--youtube_dl/extractor/safari.py13
-rw-r--r--youtube_dl/extractor/sbs.py44
-rw-r--r--youtube_dl/extractor/screenwavemedia.py118
-rw-r--r--youtube_dl/extractor/senateisvp.py8
-rw-r--r--youtube_dl/extractor/sexykarma.py1
-rw-r--r--youtube_dl/extractor/shahid.py107
-rw-r--r--youtube_dl/extractor/shared.py23
-rw-r--r--youtube_dl/extractor/smotri.py60
-rw-r--r--youtube_dl/extractor/snagfilms.py181
-rw-r--r--youtube_dl/extractor/sockshare.py83
-rw-r--r--youtube_dl/extractor/sohu.py58
-rw-r--r--youtube_dl/extractor/soompi.py146
-rw-r--r--youtube_dl/extractor/soundcloud.py140
-rw-r--r--youtube_dl/extractor/southpark.py32
-rw-r--r--youtube_dl/extractor/spankwire.py70
-rw-r--r--youtube_dl/extractor/spiegel.py5
-rw-r--r--youtube_dl/extractor/spiegeltv.py52
-rw-r--r--youtube_dl/extractor/sportbox.py131
-rw-r--r--youtube_dl/extractor/sportdeutschland.py10
-rw-r--r--youtube_dl/extractor/sunporno.py2
-rw-r--r--youtube_dl/extractor/svt.py (renamed from youtube_dl/extractor/svtplay.py)105
-rw-r--r--youtube_dl/extractor/tagesschau.py75
-rw-r--r--youtube_dl/extractor/teamcoco.py87
-rw-r--r--youtube_dl/extractor/telecinco.py9
-rw-r--r--youtube_dl/extractor/telegraaf.py35
-rw-r--r--youtube_dl/extractor/tenplay.py27
-rw-r--r--youtube_dl/extractor/tf1.py12
-rw-r--r--youtube_dl/extractor/theplatform.py283
-rw-r--r--youtube_dl/extractor/thesixtyone.py18
-rw-r--r--youtube_dl/extractor/thisamericanlife.py40
-rw-r--r--youtube_dl/extractor/tlc.py15
-rw-r--r--youtube_dl/extractor/tmz.py28
-rw-r--r--youtube_dl/extractor/tnaflix.py271
-rw-r--r--youtube_dl/extractor/tube8.py14
-rw-r--r--youtube_dl/extractor/tubitv.py82
-rw-r--r--youtube_dl/extractor/tudou.py9
-rw-r--r--youtube_dl/extractor/tumblr.py32
-rw-r--r--youtube_dl/extractor/turbo.py4
-rw-r--r--youtube_dl/extractor/tutv.py2
-rw-r--r--youtube_dl/extractor/tv2.py126
-rw-r--r--youtube_dl/extractor/tvc.py109
-rw-r--r--youtube_dl/extractor/tvigle.py39
-rw-r--r--youtube_dl/extractor/tvplay.py18
-rw-r--r--youtube_dl/extractor/tweakers.py50
-rw-r--r--youtube_dl/extractor/twentyfourvideo.py4
-rw-r--r--youtube_dl/extractor/twitch.py129
-rw-r--r--youtube_dl/extractor/twitter.py72
-rw-r--r--youtube_dl/extractor/udemy.py44
-rw-r--r--youtube_dl/extractor/udn.py1
-rw-r--r--youtube_dl/extractor/ultimedia.py10
-rw-r--r--youtube_dl/extractor/vbox7.py21
-rw-r--r--youtube_dl/extractor/veehd.py3
-rw-r--r--youtube_dl/extractor/vessel.py12
-rw-r--r--youtube_dl/extractor/vgtv.py114
-rw-r--r--youtube_dl/extractor/vice.py38
-rw-r--r--youtube_dl/extractor/videobam.py81
-rw-r--r--youtube_dl/extractor/videolecturesnet.py2
-rw-r--r--youtube_dl/extractor/videomega.py28
-rw-r--r--youtube_dl/extractor/videott.py2
-rw-r--r--youtube_dl/extractor/vidme.py46
-rw-r--r--youtube_dl/extractor/vier.py9
-rw-r--r--youtube_dl/extractor/viewster.py223
-rw-r--r--youtube_dl/extractor/viki.py379
-rw-r--r--youtube_dl/extractor/vimeo.py62
-rw-r--r--youtube_dl/extractor/vimple.py46
-rw-r--r--youtube_dl/extractor/vine.py2
-rw-r--r--youtube_dl/extractor/vk.py166
-rw-r--r--youtube_dl/extractor/vodlocker.py9
-rw-r--r--youtube_dl/extractor/voicerepublic.py99
-rw-r--r--youtube_dl/extractor/vube.py1
-rw-r--r--youtube_dl/extractor/vuclip.py2
-rw-r--r--youtube_dl/extractor/vulture.py2
-rw-r--r--youtube_dl/extractor/webofstories.py41
-rw-r--r--youtube_dl/extractor/wimp.py3
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py23
-rw-r--r--youtube_dl/extractor/xbef.py6
-rw-r--r--youtube_dl/extractor/xhamster.py69
-rw-r--r--youtube_dl/extractor/xminus.py4
-rw-r--r--youtube_dl/extractor/xnxx.py6
-rw-r--r--youtube_dl/extractor/xstream.py115
-rw-r--r--youtube_dl/extractor/xtube.py6
-rw-r--r--youtube_dl/extractor/xuite.py1
-rw-r--r--youtube_dl/extractor/xvideos.py31
-rw-r--r--youtube_dl/extractor/yahoo.py18
-rw-r--r--youtube_dl/extractor/yam.py27
-rw-r--r--youtube_dl/extractor/yandexmusic.py106
-rw-r--r--youtube_dl/extractor/yinyuetai.py56
-rw-r--r--youtube_dl/extractor/ynet.py4
-rw-r--r--youtube_dl/extractor/youku.py307
-rw-r--r--youtube_dl/extractor/youtube.py726
-rw-r--r--youtube_dl/extractor/zingmp3.py10
-rw-r--r--youtube_dl/options.py25
-rw-r--r--youtube_dl/postprocessor/common.py10
-rw-r--r--youtube_dl/postprocessor/embedthumbnail.py40
-rw-r--r--youtube_dl/postprocessor/execafterdownload.py4
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py226
-rw-r--r--youtube_dl/postprocessor/xattrpp.py74
-rw-r--r--youtube_dl/update.py2
-rw-r--r--youtube_dl/utils.py570
-rw-r--r--youtube_dl/version.py2
278 files changed, 13770 insertions, 4135 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index eb7470f72..982e658ce 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -21,24 +21,24 @@ import subprocess
import socket
import sys
import time
+import tokenize
import traceback
if os.name == 'nt':
import ctypes
from .compat import (
- compat_basestring,
compat_cookiejar,
compat_expanduser,
compat_get_terminal_size,
compat_http_client,
compat_kwargs,
compat_str,
+ compat_tokenize_tokenize,
compat_urllib_error,
compat_urllib_request,
)
from .utils import (
- escape_url,
ContentTooShortError,
date_from_str,
DateRange,
@@ -118,7 +118,7 @@ class YoutubeDL(object):
username: Username for authentication purposes.
password: Password for authentication purposes.
- videopassword: Password for acces a video.
+ videopassword: Password for accessing a video.
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
@@ -138,6 +138,7 @@ class YoutubeDL(object):
outtmpl: Template for output names.
restrictfilenames: Do not allow "&" and spaces in file names
ignoreerrors: Do not stop on download errors.
+ force_generic_extractor: Force downloader to use the generic extractor
nooverwrites: Prevent overwriting files.
playliststart: Playlist item to start at.
playlistend: Playlist item to end at.
@@ -260,7 +261,8 @@ class YoutubeDL(object):
The following options are used by the post processors:
prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
otherwise prefer avconv.
- exec_cmd: Arbitrary command to run after downloading
+ postprocessor_args: A list of additional command-line arguments for the
+ postprocessor.
"""
params = None
@@ -626,13 +628,16 @@ class YoutubeDL(object):
info_dict.setdefault(key, value)
def extract_info(self, url, download=True, ie_key=None, extra_info={},
- process=True):
+ process=True, force_generic_extractor=False):
'''
Returns a list with a dictionary for each video we find.
If 'download', also downloads the videos.
extra_info is a dict containing the extra values to add to each result
'''
+ if not ie_key and force_generic_extractor:
+ ie_key = 'Generic'
+
if ie_key:
ies = [self.get_info_extractor(ie_key)]
else:
@@ -760,7 +765,9 @@ class YoutubeDL(object):
if isinstance(ie_entries, list):
n_all_entries = len(ie_entries)
if playlistitems:
- entries = [ie_entries[i - 1] for i in playlistitems]
+ entries = [
+ ie_entries[i - 1] for i in playlistitems
+ if -n_all_entries <= i - 1 < n_all_entries]
else:
entries = ie_entries[playliststart:playlistend]
n_entries = len(entries)
@@ -845,8 +852,8 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
- def _apply_format_filter(self, format_spec, available_formats):
- " Returns a tuple of the remaining format_spec and filtered formats "
+ def _build_format_filter(self, filter_spec):
+ " Returns a function to filter the formats according to the filter_spec "
OPERATORS = {
'<': operator.lt,
@@ -856,13 +863,13 @@ class YoutubeDL(object):
'=': operator.eq,
'!=': operator.ne,
}
- operator_rex = re.compile(r'''(?x)\s*\[
+ operator_rex = re.compile(r'''(?x)\s*
(?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
- \]$
+ $
''' % '|'.join(map(re.escape, OPERATORS.keys())))
- m = operator_rex.search(format_spec)
+ m = operator_rex.search(filter_spec)
if m:
try:
comparison_value = int(m.group('value'))
@@ -873,7 +880,7 @@ class YoutubeDL(object):
if comparison_value is None:
raise ValueError(
'Invalid value %r in format specification %r' % (
- m.group('value'), format_spec))
+ m.group('value'), filter_spec))
op = OPERATORS[m.group('op')]
if not m:
@@ -881,84 +888,283 @@ class YoutubeDL(object):
'=': operator.eq,
'!=': operator.ne,
}
- str_operator_rex = re.compile(r'''(?x)\s*\[
+ str_operator_rex = re.compile(r'''(?x)
\s*(?P<key>ext|acodec|vcodec|container|protocol)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<value>[a-zA-Z0-9_-]+)
- \s*\]$
+ \s*$
''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
- m = str_operator_rex.search(format_spec)
+ m = str_operator_rex.search(filter_spec)
if m:
comparison_value = m.group('value')
op = STR_OPERATORS[m.group('op')]
if not m:
- raise ValueError('Invalid format specification %r' % format_spec)
+ raise ValueError('Invalid filter specification %r' % filter_spec)
def _filter(f):
actual_value = f.get(m.group('key'))
if actual_value is None:
return m.group('none_inclusive')
return op(actual_value, comparison_value)
- new_formats = [f for f in available_formats if _filter(f)]
+ return _filter
+
+ def build_format_selector(self, format_spec):
+ def syntax_error(note, start):
+ message = (
+ 'Invalid format specification: '
+ '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
+ return SyntaxError(message)
+
+ PICKFIRST = 'PICKFIRST'
+ MERGE = 'MERGE'
+ SINGLE = 'SINGLE'
+ GROUP = 'GROUP'
+ FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
+
+ def _parse_filter(tokens):
+ filter_parts = []
+ for type, string, start, _, _ in tokens:
+ if type == tokenize.OP and string == ']':
+ return ''.join(filter_parts)
+ else:
+ filter_parts.append(string)
+
+ def _remove_unused_ops(tokens):
+ # Remove operators that we don't use and join them with the sourrounding strings
+ # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
+ ALLOWED_OPS = ('/', '+', ',', '(', ')')
+ last_string, last_start, last_end, last_line = None, None, None, None
+ for type, string, start, end, line in tokens:
+ if type == tokenize.OP and string == '[':
+ if last_string:
+ yield tokenize.NAME, last_string, last_start, last_end, last_line
+ last_string = None
+ yield type, string, start, end, line
+ # everything inside brackets will be handled by _parse_filter
+ for type, string, start, end, line in tokens:
+ yield type, string, start, end, line
+ if type == tokenize.OP and string == ']':
+ break
+ elif type == tokenize.OP and string in ALLOWED_OPS:
+ if last_string:
+ yield tokenize.NAME, last_string, last_start, last_end, last_line
+ last_string = None
+ yield type, string, start, end, line
+ elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
+ if not last_string:
+ last_string = string
+ last_start = start
+ last_end = end
+ else:
+ last_string += string
+ if last_string:
+ yield tokenize.NAME, last_string, last_start, last_end, last_line
+
+ def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
+ selectors = []
+ current_selector = None
+ for type, string, start, _, _ in tokens:
+ # ENCODING is only defined in python 3.x
+ if type == getattr(tokenize, 'ENCODING', None):
+ continue
+ elif type in [tokenize.NAME, tokenize.NUMBER]:
+ current_selector = FormatSelector(SINGLE, string, [])
+ elif type == tokenize.OP:
+ if string == ')':
+ if not inside_group:
+ # ')' will be handled by the parentheses group
+ tokens.restore_last_token()
+ break
+ elif inside_merge and string in ['/', ',']:
+ tokens.restore_last_token()
+ break
+ elif inside_choice and string == ',':
+ tokens.restore_last_token()
+ break
+ elif string == ',':
+ if not current_selector:
+ raise syntax_error('"," must follow a format selector', start)
+ selectors.append(current_selector)
+ current_selector = None
+ elif string == '/':
+ if not current_selector:
+ raise syntax_error('"/" must follow a format selector', start)
+ first_choice = current_selector
+ second_choice = _parse_format_selection(tokens, inside_choice=True)
+ current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
+ elif string == '[':
+ if not current_selector:
+ current_selector = FormatSelector(SINGLE, 'best', [])
+ format_filter = _parse_filter(tokens)
+ current_selector.filters.append(format_filter)
+ elif string == '(':
+ if current_selector:
+ raise syntax_error('Unexpected "("', start)
+ group = _parse_format_selection(tokens, inside_group=True)
+ current_selector = FormatSelector(GROUP, group, [])
+ elif string == '+':
+ video_selector = current_selector
+ audio_selector = _parse_format_selection(tokens, inside_merge=True)
+ if not video_selector or not audio_selector:
+ raise syntax_error('"+" must be between two format selectors', start)
+ current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
+ else:
+ raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
+ elif type == tokenize.ENDMARKER:
+ break
+ if current_selector:
+ selectors.append(current_selector)
+ return selectors
+
+ def _build_selector_function(selector):
+ if isinstance(selector, list):
+ fs = [_build_selector_function(s) for s in selector]
+
+ def selector_function(formats):
+ for f in fs:
+ for format in f(formats):
+ yield format
+ return selector_function
+ elif selector.type == GROUP:
+ selector_function = _build_selector_function(selector.selector)
+ elif selector.type == PICKFIRST:
+ fs = [_build_selector_function(s) for s in selector.selector]
+
+ def selector_function(formats):
+ for f in fs:
+ picked_formats = list(f(formats))
+ if picked_formats:
+ return picked_formats
+ return []
+ elif selector.type == SINGLE:
+ format_spec = selector.selector
+
+ def selector_function(formats):
+ formats = list(formats)
+ if not formats:
+ return
+ if format_spec == 'all':
+ for f in formats:
+ yield f
+ elif format_spec in ['best', 'worst', None]:
+ format_idx = 0 if format_spec == 'worst' else -1
+ audiovideo_formats = [
+ f for f in formats
+ if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
+ if audiovideo_formats:
+ yield audiovideo_formats[format_idx]
+ # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
+ elif (all(f.get('acodec') != 'none' for f in formats) or
+ all(f.get('vcodec') != 'none' for f in formats)):
+ yield formats[format_idx]
+ elif format_spec == 'bestaudio':
+ audio_formats = [
+ f for f in formats
+ if f.get('vcodec') == 'none']
+ if audio_formats:
+ yield audio_formats[-1]
+ elif format_spec == 'worstaudio':
+ audio_formats = [
+ f for f in formats
+ if f.get('vcodec') == 'none']
+ if audio_formats:
+ yield audio_formats[0]
+ elif format_spec == 'bestvideo':
+ video_formats = [
+ f for f in formats
+ if f.get('acodec') == 'none']
+ if video_formats:
+ yield video_formats[-1]
+ elif format_spec == 'worstvideo':
+ video_formats = [
+ f for f in formats
+ if f.get('acodec') == 'none']
+ if video_formats:
+ yield video_formats[0]
+ else:
+ extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
+ if format_spec in extensions:
+ filter_f = lambda f: f['ext'] == format_spec
+ else:
+ filter_f = lambda f: f['format_id'] == format_spec
+ matches = list(filter(filter_f, formats))
+ if matches:
+ yield matches[-1]
+ elif selector.type == MERGE:
+ def _merge(formats_info):
+ format_1, format_2 = [f['format_id'] for f in formats_info]
+ # The first format must contain the video and the
+ # second the audio
+ if formats_info[0].get('vcodec') == 'none':
+ self.report_error('The first format must '
+ 'contain the video, try using '
+ '"-f %s+%s"' % (format_2, format_1))
+ return
+ output_ext = (
+ formats_info[0]['ext']
+ if self.params.get('merge_output_format') is None
+ else self.params['merge_output_format'])
+ return {
+ 'requested_formats': formats_info,
+ 'format': '%s+%s' % (formats_info[0].get('format'),
+ formats_info[1].get('format')),
+ 'format_id': '%s+%s' % (formats_info[0].get('format_id'),
+ formats_info[1].get('format_id')),
+ 'width': formats_info[0].get('width'),
+ 'height': formats_info[0].get('height'),
+ 'resolution': formats_info[0].get('resolution'),
+ 'fps': formats_info[0].get('fps'),
+ 'vcodec': formats_info[0].get('vcodec'),
+ 'vbr': formats_info[0].get('vbr'),
+ 'stretched_ratio': formats_info[0].get('stretched_ratio'),
+ 'acodec': formats_info[1].get('acodec'),
+ 'abr': formats_info[1].get('abr'),
+ 'ext': output_ext,
+ }
+ video_selector, audio_selector = map(_build_selector_function, selector.selector)
- new_format_spec = format_spec[:-len(m.group(0))]
- if not new_format_spec:
- new_format_spec = 'best'
+ def selector_function(formats):
+ formats = list(formats)
+ for pair in itertools.product(video_selector(formats), audio_selector(formats)):
+ yield _merge(pair)
- return (new_format_spec, new_formats)
+ filters = [self._build_format_filter(f) for f in selector.filters]
- def select_format(self, format_spec, available_formats):
- while format_spec.endswith(']'):
- format_spec, available_formats = self._apply_format_filter(
- format_spec, available_formats)
- if not available_formats:
- return None
+ def final_selector(formats):
+ for _filter in filters:
+ formats = list(filter(_filter, formats))
+ return selector_function(formats)
+ return final_selector
- if format_spec in ['best', 'worst', None]:
- format_idx = 0 if format_spec == 'worst' else -1
- audiovideo_formats = [
- f for f in available_formats
- if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
- if audiovideo_formats:
- return audiovideo_formats[format_idx]
- # for audio only urls, select the best/worst audio format
- elif all(f.get('acodec') != 'none' for f in available_formats):
- return available_formats[format_idx]
- elif format_spec == 'bestaudio':
- audio_formats = [
- f for f in available_formats
- if f.get('vcodec') == 'none']
- if audio_formats:
- return audio_formats[-1]
- elif format_spec == 'worstaudio':
- audio_formats = [
- f for f in available_formats
- if f.get('vcodec') == 'none']
- if audio_formats:
- return audio_formats[0]
- elif format_spec == 'bestvideo':
- video_formats = [
- f for f in available_formats
- if f.get('acodec') == 'none']
- if video_formats:
- return video_formats[-1]
- elif format_spec == 'worstvideo':
- video_formats = [
- f for f in available_formats
- if f.get('acodec') == 'none']
- if video_formats:
- return video_formats[0]
- else:
- extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
- if format_spec in extensions:
- filter_f = lambda f: f['ext'] == format_spec
- else:
- filter_f = lambda f: f['format_id'] == format_spec
- matches = list(filter(filter_f, available_formats))
- if matches:
- return matches[-1]
- return None
+ stream = io.BytesIO(format_spec.encode('utf-8'))
+ try:
+ tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
+ except tokenize.TokenError:
+ raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
+
+ class TokenIterator(object):
+ def __init__(self, tokens):
+ self.tokens = tokens
+ self.counter = 0
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if self.counter >= len(self.tokens):
+ raise StopIteration()
+ value = self.tokens[self.counter]
+ self.counter += 1
+ return value
+
+ next = __next__
+
+ def restore_last_token(self):
+ self.counter -= 1
+
+ parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
+ return _build_selector_function(parsed_selector)
def _calc_headers(self, info_dict):
res = std_headers.copy()
@@ -1001,7 +1207,7 @@ class YoutubeDL(object):
t.get('preference'), t.get('width'), t.get('height'),
t.get('id'), t.get('url')))
for i, t in enumerate(thumbnails):
- if 'width' in t and 'height' in t:
+ if t.get('width') and t.get('height'):
t['resolution'] = '%dx%d' % (t['width'], t['height'])
if t.get('id') is None:
t['id'] = '%d' % i
@@ -1013,13 +1219,13 @@ class YoutubeDL(object):
info_dict['display_id'] = info_dict['id']
if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
- # Working around negative timestamps in Windows
- # (see http://bugs.python.org/issue1646728)
- if info_dict['timestamp'] < 0 and os.name == 'nt':
- info_dict['timestamp'] = 0
- upload_date = datetime.datetime.utcfromtimestamp(
- info_dict['timestamp'])
- info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+ # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+ # see http://bugs.python.org/issue1646728)
+ try:
+ upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+ info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+ except (ValueError, OverflowError, OSError):
+ pass
if self.params.get('listsubtitles', False):
if 'automatic_captions' in info_dict:
@@ -1030,12 +1236,6 @@ class YoutubeDL(object):
info_dict['id'], info_dict.get('subtitles'),
info_dict.get('automatic_captions'))
- # This extractors handle format selection themselves
- if info_dict['extractor'] in ['Youku']:
- if download:
- self.process_info(info_dict)
- return info_dict
-
# We now pick which formats have to be downloaded
if info_dict.get('formats') is None:
# There's only one format available
@@ -1046,6 +1246,8 @@ class YoutubeDL(object):
if not formats:
raise ExtractorError('No video formats found!')
+ formats_dict = {}
+
# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
if 'url' not in format:
@@ -1053,6 +1255,18 @@ class YoutubeDL(object):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
+ format_id = format['format_id']
+ if format_id not in formats_dict:
+ formats_dict[format_id] = []
+ formats_dict[format_id].append(format)
+
+ # Make sure all formats have unique format_id
+ for format_id, ambiguous_formats in formats_dict.items():
+ if len(ambiguous_formats) > 1:
+ for i, format in enumerate(ambiguous_formats):
+ format['format_id'] = '%s-%d' % (format_id, i)
+
+ for i, format in enumerate(formats):
if format.get('format') is None:
format['format'] = '{id} - {res}{note}'.format(
id=format['format_id'],
@@ -1086,60 +1300,16 @@ class YoutubeDL(object):
req_format = self.params.get('format')
if req_format is None:
req_format_list = []
- if info_dict['extractor'] in ['youtube', 'ted'] and FFmpegMergerPP(self).available:
- req_format_list.append('bestvideo+bestaudio')
+ if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
+ info_dict['extractor'] in ['youtube', 'ted'] and
+ not info_dict.get('is_live')):
+ merger = FFmpegMergerPP(self)
+ if merger.available and merger.can_merge():
+ req_format_list.append('bestvideo+bestaudio')
req_format_list.append('best')
req_format = '/'.join(req_format_list)
- formats_to_download = []
- if req_format == 'all':
- formats_to_download = formats
- else:
- for rfstr in req_format.split(','):
- # We can accept formats requested in the format: 34/5/best, we pick
- # the first that is available, starting from left
- req_formats = rfstr.split('/')
- for rf in req_formats:
- if re.match(r'.+?\+.+?', rf) is not None:
- # Two formats have been requested like '137+139'
- format_1, format_2 = rf.split('+')
- formats_info = (self.select_format(format_1, formats),
- self.select_format(format_2, formats))
- if all(formats_info):
- # The first format must contain the video and the
- # second the audio
- if formats_info[0].get('vcodec') == 'none':
- self.report_error('The first format must '
- 'contain the video, try using '
- '"-f %s+%s"' % (format_2, format_1))
- return
- output_ext = (
- formats_info[0]['ext']
- if self.params.get('merge_output_format') is None
- else self.params['merge_output_format'])
- selected_format = {
- 'requested_formats': formats_info,
- 'format': '%s+%s' % (formats_info[0].get('format'),
- formats_info[1].get('format')),
- 'format_id': '%s+%s' % (formats_info[0].get('format_id'),
- formats_info[1].get('format_id')),
- 'width': formats_info[0].get('width'),
- 'height': formats_info[0].get('height'),
- 'resolution': formats_info[0].get('resolution'),
- 'fps': formats_info[0].get('fps'),
- 'vcodec': formats_info[0].get('vcodec'),
- 'vbr': formats_info[0].get('vbr'),
- 'stretched_ratio': formats_info[0].get('stretched_ratio'),
- 'acodec': formats_info[1].get('acodec'),
- 'abr': formats_info[1].get('abr'),
- 'ext': output_ext,
- }
- else:
- selected_format = None
- else:
- selected_format = self.select_format(rf, formats)
- if selected_format is not None:
- formats_to_download.append(selected_format)
- break
+ format_selector = self.build_format_selector(req_format)
+ formats_to_download = list(format_selector(formats))
if not formats_to_download:
raise ExtractorError('requested format not available',
expected=True)
@@ -1364,7 +1534,7 @@ class YoutubeDL(object):
postprocessors = []
self.report_warning('You have requested multiple '
'formats but ffmpeg or avconv are not installed.'
- ' The formats won\'t be merged')
+ ' The formats won\'t be merged.')
else:
postprocessors = [merger]
@@ -1391,8 +1561,8 @@ class YoutubeDL(object):
requested_formats = info_dict['requested_formats']
if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
info_dict['ext'] = 'mkv'
- self.report_warning('You have requested formats incompatible for merge. '
- 'The formats will be merged into mkv')
+ self.report_warning(
+ 'Requested formats are incompatible for merge and will be merged into mkv.')
# Ensure filename always has a correct extension for successful merge
filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
if os.path.exists(encodeFilename(filename)):
@@ -1479,7 +1649,8 @@ class YoutubeDL(object):
for url in url_list:
try:
# It also downloads the videos
- res = self.extract_info(url)
+ res = self.extract_info(
+ url, force_generic_extractor=self.params.get('force_generic_extractor', False))
except UnavailableVideoError:
self.report_error('unable to download video')
except MaxDownloadsReached:
@@ -1523,6 +1694,7 @@ class YoutubeDL(object):
pps_chain.extend(ie_info['__postprocessors'])
pps_chain.extend(self._pps)
for pp in pps_chain:
+ files_to_delete = []
try:
files_to_delete, info = pp.run(info)
except PostProcessingError as e:
@@ -1685,26 +1857,6 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
-
- # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
- # always respected by websites, some tend to give out URLs with non percent-encoded
- # non-ASCII characters (see telemb.py, ard.py [#3412])
- # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
- # To work around aforementioned issue we will replace request's original URL with
- # percent-encoded one
- req_is_string = isinstance(req, compat_basestring)
- url = req if req_is_string else req.get_full_url()
- url_escaped = escape_url(url)
-
- # Substitute URL if any change after escaping
- if url != url_escaped:
- if req_is_string:
- req = url_escaped
- else:
- req = compat_urllib_request.Request(
- url_escaped, data=req.data, headers=req.headers,
- origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
-
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
@@ -1847,7 +1999,7 @@ class YoutubeDL(object):
thumb_ext = determine_ext(t['url'], 'jpg')
suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
- thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+ t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
self.to_screen('[%s] %s: Thumbnail %sis already present' %
@@ -1857,7 +2009,7 @@ class YoutubeDL(object):
(info_dict['extractor'], info_dict['id'], thumb_display_id))
try:
uf = self.urlopen(t['url'])
- with open(thumb_filename, 'wb') as thumbf:
+ with open(encodeFilename(thumb_filename), 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
(info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index c88489f29..55b22c889 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -169,7 +169,7 @@ def _real_main(argv=None):
if not opts.audioquality.isdigit():
parser.error('invalid audio quality specified')
if opts.recodevideo is not None:
- if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
+ if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']:
parser.error('invalid video recode format specified')
if opts.convertsubtitles is not None:
if opts.convertsubtitles not in ['srt', 'vtt', 'ass']:
@@ -240,13 +240,18 @@ def _real_main(argv=None):
if opts.xattrs:
postprocessors.append({'key': 'XAttrMetadata'})
if opts.embedthumbnail:
- postprocessors.append({'key': 'EmbedThumbnail'})
+ already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails
+ postprocessors.append({
+ 'key': 'EmbedThumbnail',
+ 'already_have_thumbnail': already_have_thumbnail
+ })
+ if not already_have_thumbnail:
+ opts.writethumbnail = True
# Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
# So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
if opts.exec_cmd:
postprocessors.append({
'key': 'ExecAfterDownload',
- 'verboseOutput': opts.verbose,
'exec_cmd': opts.exec_cmd,
})
if opts.xattr_set_filesize:
@@ -258,6 +263,9 @@ def _real_main(argv=None):
external_downloader_args = None
if opts.external_downloader_args:
external_downloader_args = shlex.split(opts.external_downloader_args)
+ postprocessor_args = None
+ if opts.postprocessor_args:
+ postprocessor_args = shlex.split(opts.postprocessor_args)
match_filter = (
None if opts.match_filter is None
else match_filter_func(opts.match_filter))
@@ -288,6 +296,7 @@ def _real_main(argv=None):
'autonumber_size': opts.autonumber_size,
'restrictfilenames': opts.restrictfilenames,
'ignoreerrors': opts.ignoreerrors,
+ 'force_generic_extractor': opts.force_generic_extractor,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'retries': opts_retries,
@@ -345,7 +354,6 @@ def _real_main(argv=None):
'default_search': opts.default_search,
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'encoding': opts.encoding,
- 'exec_cmd': opts.exec_cmd,
'extract_flat': opts.extract_flat,
'merge_output_format': opts.merge_output_format,
'postprocessors': postprocessors,
@@ -362,6 +370,7 @@ def _real_main(argv=None):
'ffmpeg_location': opts.ffmpeg_location,
'hls_prefer_native': opts.hls_prefer_native,
'external_downloader_args': external_downloader_args,
+ 'postprocessor_args': postprocessor_args,
'cn_verification_proxy': opts.cn_verification_proxy,
}
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py
index 07224d508..7817adcfd 100644
--- a/youtube_dl/aes.py
+++ b/youtube_dl/aes.py
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
"""
NONCE_LENGTH_BYTES = 8
- data = bytes_to_intlist(base64.b64decode(data))
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
password = bytes_to_intlist(password.encode('utf-8'))
key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index f9529210d..ace5bd716 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -9,6 +9,7 @@ import shutil
import socket
import subprocess
import sys
+import itertools
try:
@@ -42,6 +43,11 @@ except ImportError: # Python 2
import cookielib as compat_cookiejar
try:
+ import http.cookies as compat_cookies
+except ImportError: # Python 2
+ import Cookie as compat_cookies
+
+try:
import html.entities as compat_html_entities
except ImportError: # Python 2
import htmlentitydefs as compat_html_entities
@@ -74,42 +80,74 @@ except ImportError:
import BaseHTTPServer as compat_http_server
try:
+ from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
- def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
- if string == '':
+ from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+except ImportError: # Python 2
+ _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
+ else re.compile('([\x00-\x7f]+)'))
+
+ # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
+ # implementations from cpython 3.4.3's stdlib. Python 2's version
+ # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244)
+
+ def compat_urllib_parse_unquote_to_bytes(string):
+ """unquote_to_bytes('abc%20def') -> b'abc def'."""
+ # Note: strings are encoded as UTF-8. This is only an issue if it contains
+ # unescaped non-ASCII characters, which URIs should not.
+ if not string:
+ # Is it a string-like object?
+ string.split
+ return b''
+ if isinstance(string, unicode):
+ string = string.encode('utf-8')
+ bits = string.split(b'%')
+ if len(bits) == 1:
return string
- res = string.split('%')
- if len(res) == 1:
+ res = [bits[0]]
+ append = res.append
+ for item in bits[1:]:
+ try:
+ append(compat_urllib_parse._hextochr[item[:2]])
+ append(item[2:])
+ except KeyError:
+ append(b'%')
+ append(item)
+ return b''.join(res)
+
+ def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
+ """Replace %xx escapes by their single-character equivalent. The optional
+ encoding and errors parameters specify how to decode percent-encoded
+ sequences into Unicode characters, as accepted by the bytes.decode()
+ method.
+ By default, percent-encoded sequences are decoded with UTF-8, and invalid
+ sequences are replaced by a placeholder character.
+
+ unquote('abc%20def') -> 'abc def'.
+ """
+ if '%' not in string:
+ string.split
return string
if encoding is None:
encoding = 'utf-8'
if errors is None:
errors = 'replace'
- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
- pct_sequence = b''
- string = res[0]
- for item in res[1:]:
- try:
- if not item:
- raise ValueError
- pct_sequence += item[:2].decode('hex')
- rest = item[2:]
- if not rest:
- # This segment was just a single percent-encoded character.
- # May be part of a sequence of code units, so delay decoding.
- # (Stored in pct_sequence).
- continue
- except ValueError:
- rest = '%' + item
- # Encountered non-percent-encoded characters. Flush the current
- # pct_sequence.
- string += pct_sequence.decode(encoding, errors) + rest
- pct_sequence = b''
- if pct_sequence:
- # Flush the final pct_sequence
- string += pct_sequence.decode(encoding, errors)
- return string
+ bits = _asciire.split(string)
+ res = [bits[0]]
+ append = res.append
+ for i in range(1, len(bits), 2):
+ append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors))
+ append(bits[i + 1])
+ return ''.join(res)
+
+ def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'):
+ """Like unquote(), but also replace plus signs by spaces, as required for
+ unquoting HTML form values.
+
+ unquote_plus('%7e/abc+def') -> '~/abc def'
+ """
+ string = string.replace('+', ' ')
+ return compat_urllib_parse_unquote(string, encoding, errors)
try:
compat_str = unicode # Python 2
@@ -388,12 +426,27 @@ else:
pass
return _terminal_size(columns, lines)
+try:
+ itertools.count(start=0, step=1)
+ compat_itertools_count = itertools.count
+except TypeError: # Python 2.6
+ def compat_itertools_count(start=0, step=1):
+ n = start
+ while True:
+ yield n
+ n += step
+
+if sys.version_info >= (3, 0):
+ from tokenize import tokenize as compat_tokenize_tokenize
+else:
+ from tokenize import generate_tokens as compat_tokenize_tokenize
__all__ = [
'compat_HTTPError',
'compat_basestring',
'compat_chr',
'compat_cookiejar',
+ 'compat_cookies',
'compat_expanduser',
'compat_get_terminal_size',
'compat_getenv',
@@ -401,6 +454,7 @@ __all__ = [
'compat_html_entities',
'compat_http_client',
'compat_http_server',
+ 'compat_itertools_count',
'compat_kwargs',
'compat_ord',
'compat_parse_qs',
@@ -408,9 +462,12 @@ __all__ = [
'compat_socket_create_connection',
'compat_str',
'compat_subprocess_get_DEVNULL',
+ 'compat_tokenize_tokenize',
'compat_urllib_error',
'compat_urllib_parse',
'compat_urllib_parse_unquote',
+ 'compat_urllib_parse_unquote_plus',
+ 'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlparse',
'compat_urllib_request',
'compat_urlparse',
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
index f110830c4..dccc59212 100644
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@@ -8,6 +8,7 @@ from .hls import NativeHlsFD
from .http import HttpFD
from .rtsp import RtspFD
from .rtmp import RtmpFD
+from .dash import DashSegmentsFD
from ..utils import (
determine_protocol,
@@ -20,6 +21,7 @@ PROTOCOL_MAP = {
'mms': RtspFD,
'rtsp': RtspFD,
'f4m': F4mFD,
+ 'http_dash_segments': DashSegmentsFD,
}
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
new file mode 100644
index 000000000..8b6fa2753
--- /dev/null
+++ b/youtube_dl/downloader/dash.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import FileDownloader
+from ..compat import compat_urllib_request
+
+
+class DashSegmentsFD(FileDownloader):
+ """
+ Download segments in a DASH manifest
+ """
+ def real_download(self, filename, info_dict):
+ self.report_destination(filename)
+ tmpfilename = self.temp_name(filename)
+ base_url = info_dict['url']
+ segment_urls = info_dict['segment_urls']
+
+ is_test = self.params.get('test', False)
+ remaining_bytes = self._TEST_FILE_SIZE if is_test else None
+ byte_counter = 0
+
+ def append_url_to_file(outf, target_url, target_name, remaining_bytes=None):
+ self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name))
+ req = compat_urllib_request.Request(target_url)
+ if remaining_bytes is not None:
+ req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1))
+
+ data = self.ydl.urlopen(req).read()
+
+ if remaining_bytes is not None:
+ data = data[:remaining_bytes]
+
+ outf.write(data)
+ return len(data)
+
+ def combine_url(base_url, target_url):
+ if re.match(r'^https?://', target_url):
+ return target_url
+ return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
+
+ with open(tmpfilename, 'wb') as outf:
+ append_url_to_file(
+ outf, combine_url(base_url, info_dict['initialization_url']),
+ 'initialization segment')
+ for i, segment_url in enumerate(segment_urls):
+ segment_len = append_url_to_file(
+ outf, combine_url(base_url, segment_url),
+ 'segment %d / %d' % (i + 1, len(segment_urls)),
+ remaining_bytes)
+ byte_counter += segment_len
+ if remaining_bytes is not None:
+ remaining_bytes -= segment_len
+ if remaining_bytes <= 0:
+ break
+
+ self.try_rename(tmpfilename, filename)
+
+ self._hook_progress({
+ 'downloaded_bytes': byte_counter,
+ 'total_bytes': byte_counter,
+ 'filename': filename,
+ 'status': 'finished',
+ })
+
+ return True
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index 7ca2d3143..6c310346c 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -45,11 +45,13 @@ class ExternalFD(FileDownloader):
def supports(cls, info_dict):
return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')
- def _source_address(self, command_option):
- source_address = self.params.get('source_address')
- if source_address is None:
+ def _option(self, command_option, param):
+ param = self.params.get(param)
+ if param is None:
return []
- return [command_option, source_address]
+ if isinstance(param, bool):
+ return [command_option]
+ return [command_option, param]
def _configuration_args(self, default=[]):
ex_args = self.params.get('external_downloader_args')
@@ -77,7 +79,17 @@ class CurlFD(ExternalFD):
cmd = [self.exe, '--location', '-o', tmpfilename]
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
- cmd += self._source_address('--interface')
+ cmd += self._option('--interface', 'source_address')
+ cmd += self._configuration_args()
+ cmd += ['--', info_dict['url']]
+ return cmd
+
+
+class AxelFD(ExternalFD):
+ def _make_cmd(self, tmpfilename, info_dict):
+ cmd = [self.exe, '-o', tmpfilename]
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['-H', '%s: %s' % (key, val)]
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
return cmd
@@ -88,7 +100,9 @@ class WgetFD(ExternalFD):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
- cmd += self._source_address('--bind-address')
+ cmd += self._option('--bind-address', 'source_address')
+ cmd += self._option('--proxy', 'proxy')
+ cmd += self._option('--no-check-certificate', 'nocheckcertificate')
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
return cmd
@@ -105,10 +119,19 @@ class Aria2cFD(ExternalFD):
cmd += ['--out', os.path.basename(tmpfilename)]
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
- cmd += self._source_address('--interface')
+ cmd += self._option('--interface', 'source_address')
+ cmd += self._option('--all-proxy', 'proxy')
cmd += ['--', info_dict['url']]
return cmd
+
+class HttpieFD(ExternalFD):
+ def _make_cmd(self, tmpfilename, info_dict):
+ cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['%s:%s' % (key, val)]
+ return cmd
+
_BY_NAME = dict(
(klass.get_basename(), klass)
for name, klass in globals().items()
@@ -123,5 +146,6 @@ def list_external_downloaders():
def get_external_downloader(external_downloader):
""" Given the name of the executable, see whether we support the given
downloader . """
- bn = os.path.basename(external_downloader)
+ # Drop .exe extension on Windows
+ bn = os.path.splitext(os.path.basename(external_downloader))[0]
return _BY_NAME[bn]
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 3cb07e15f..f478fc03c 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -7,8 +7,7 @@ import os
import time
import xml.etree.ElementTree as etree
-from .common import FileDownloader
-from .http import HttpFD
+from .fragment import FragmentFD
from ..compat import (
compat_urlparse,
compat_urllib_error,
@@ -16,8 +15,6 @@ from ..compat import (
from ..utils import (
struct_pack,
struct_unpack,
- encodeFilename,
- sanitize_open,
xpath_text,
)
@@ -226,16 +223,13 @@ def _add_ns(prop):
return '{http://ns.adobe.com/f4m/1.0}%s' % prop
-class HttpQuietDownloader(HttpFD):
- def to_screen(self, *args, **kargs):
- pass
-
-
-class F4mFD(FileDownloader):
+class F4mFD(FragmentFD):
"""
A downloader for f4m manifests or AdobeHDS.
"""
+ FD_NAME = 'f4m'
+
def _get_unencrypted_media(self, doc):
media = doc.findall(_add_ns('media'))
if not media:
@@ -288,7 +282,7 @@ class F4mFD(FileDownloader):
def real_download(self, filename, info_dict):
man_url = info_dict['url']
requested_bitrate = info_dict.get('tbr')
- self.to_screen('[download] Downloading f4m manifest')
+ self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
manifest = self.ydl.urlopen(man_url).read()
doc = etree.fromstring(manifest)
@@ -320,67 +314,20 @@ class F4mFD(FileDownloader):
# For some akamai manifests we'll need to add a query to the fragment url
akamai_pv = xpath_text(doc, _add_ns('pv-2.0'))
- self.report_destination(filename)
- http_dl = HttpQuietDownloader(
- self.ydl,
- {
- 'continuedl': True,
- 'quiet': True,
- 'noprogress': True,
- 'ratelimit': self.params.get('ratelimit', None),
- 'test': self.params.get('test', False),
- }
- )
- tmpfilename = self.temp_name(filename)
- (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb')
+ ctx = {
+ 'filename': filename,
+ 'total_frags': total_frags,
+ }
+
+ self._prepare_frag_download(ctx)
+
+ dest_stream = ctx['dest_stream']
write_flv_header(dest_stream)
if not live:
write_metadata_tag(dest_stream, metadata)
- # This dict stores the download progress, it's updated by the progress
- # hook
- state = {
- 'status': 'downloading',
- 'downloaded_bytes': 0,
- 'frag_index': 0,
- 'frag_count': total_frags,
- 'filename': filename,
- 'tmpfilename': tmpfilename,
- }
- start = time.time()
-
- def frag_progress_hook(s):
- if s['status'] not in ('downloading', 'finished'):
- return
-
- frag_total_bytes = s.get('total_bytes', 0)
- if s['status'] == 'finished':
- state['downloaded_bytes'] += frag_total_bytes
- state['frag_index'] += 1
-
- estimated_size = (
- (state['downloaded_bytes'] + frag_total_bytes) /
- (state['frag_index'] + 1) * total_frags)
- time_now = time.time()
- state['total_bytes_estimate'] = estimated_size
- state['elapsed'] = time_now - start
-
- if s['status'] == 'finished':
- progress = self.calc_percent(state['frag_index'], total_frags)
- else:
- frag_downloaded_bytes = s['downloaded_bytes']
- frag_progress = self.calc_percent(frag_downloaded_bytes,
- frag_total_bytes)
- progress = self.calc_percent(state['frag_index'], total_frags)
- progress += frag_progress / float(total_frags)
-
- state['eta'] = self.calc_eta(
- start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes)
- state['speed'] = s.get('speed')
- self._hook_progress(state)
-
- http_dl.add_progress_hook(frag_progress_hook)
+ self._start_frag_download(ctx)
frags_filenames = []
while fragments_list:
@@ -391,9 +338,9 @@ class F4mFD(FileDownloader):
url += '?' + akamai_pv.strip(';')
if info_dict.get('extra_param_to_segment_url'):
url += info_dict.get('extra_param_to_segment_url')
- frag_filename = '%s-%s' % (tmpfilename, name)
+ frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
try:
- success = http_dl.download(frag_filename, {'url': url})
+ success = ctx['dl'].download(frag_filename, {'url': url})
if not success:
return False
(down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
@@ -426,20 +373,9 @@ class F4mFD(FileDownloader):
msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1))
self.report_warning(msg)
- dest_stream.close()
+ self._finish_frag_download(ctx)
- elapsed = time.time() - start
- self.try_rename(tmpfilename, filename)
for frag_file in frags_filenames:
os.remove(encodeFilename(frag_file))
- fsize = os.path.getsize(encodeFilename(filename))
- self._hook_progress({
- 'downloaded_bytes': fsize,
- 'total_bytes': fsize,
- 'filename': filename,
- 'status': 'finished',
- 'elapsed': elapsed,
- })
-
return True
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
new file mode 100644
index 000000000..5a64b29ee
--- /dev/null
+++ b/youtube_dl/downloader/fragment.py
@@ -0,0 +1,111 @@
+from __future__ import division, unicode_literals
+
+import os
+import time
+
+from .common import FileDownloader
+from .http import HttpFD
+from ..utils import (
+ encodeFilename,
+ sanitize_open,
+)
+
+
+class HttpQuietDownloader(HttpFD):
+ def to_screen(self, *args, **kargs):
+ pass
+
+
+class FragmentFD(FileDownloader):
+ """
+ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests).
+ """
+
+ def _prepare_and_start_frag_download(self, ctx):
+ self._prepare_frag_download(ctx)
+ self._start_frag_download(ctx)
+
+ def _prepare_frag_download(self, ctx):
+ self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags']))
+ self.report_destination(ctx['filename'])
+ dl = HttpQuietDownloader(
+ self.ydl,
+ {
+ 'continuedl': True,
+ 'quiet': True,
+ 'noprogress': True,
+ 'ratelimit': self.params.get('ratelimit', None),
+ 'retries': self.params.get('retries', 0),
+ 'test': self.params.get('test', False),
+ }
+ )
+ tmpfilename = self.temp_name(ctx['filename'])
+ dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb')
+ ctx.update({
+ 'dl': dl,
+ 'dest_stream': dest_stream,
+ 'tmpfilename': tmpfilename,
+ })
+
+ def _start_frag_download(self, ctx):
+ total_frags = ctx['total_frags']
+ # This dict stores the download progress, it's updated by the progress
+ # hook
+ state = {
+ 'status': 'downloading',
+ 'downloaded_bytes': 0,
+ 'frag_index': 0,
+ 'frag_count': total_frags,
+ 'filename': ctx['filename'],
+ 'tmpfilename': ctx['tmpfilename'],
+ }
+ start = time.time()
+ ctx['started'] = start
+
+ def frag_progress_hook(s):
+ if s['status'] not in ('downloading', 'finished'):
+ return
+
+ frag_total_bytes = s.get('total_bytes', 0)
+ if s['status'] == 'finished':
+ state['downloaded_bytes'] += frag_total_bytes
+ state['frag_index'] += 1
+
+ estimated_size = (
+ (state['downloaded_bytes'] + frag_total_bytes) /
+ (state['frag_index'] + 1) * total_frags)
+ time_now = time.time()
+ state['total_bytes_estimate'] = estimated_size
+ state['elapsed'] = time_now - start
+
+ if s['status'] == 'finished':
+ progress = self.calc_percent(state['frag_index'], total_frags)
+ else:
+ frag_downloaded_bytes = s['downloaded_bytes']
+ frag_progress = self.calc_percent(frag_downloaded_bytes,
+ frag_total_bytes)
+ progress = self.calc_percent(state['frag_index'], total_frags)
+ progress += frag_progress / float(total_frags)
+
+ state['eta'] = self.calc_eta(
+ start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes)
+ state['speed'] = s.get('speed')
+ self._hook_progress(state)
+
+ ctx['dl'].add_progress_hook(frag_progress_hook)
+
+ return start
+
+ def _finish_frag_download(self, ctx):
+ ctx['dest_stream'].close()
+ elapsed = time.time() - ctx['started']
+ self.try_rename(ctx['tmpfilename'], ctx['filename'])
+ fsize = os.path.getsize(encodeFilename(ctx['filename']))
+
+ self._hook_progress({
+ 'downloaded_bytes': fsize,
+ 'total_bytes': fsize,
+ 'filename': ctx['filename'],
+ 'status': 'finished',
+ 'elapsed': elapsed,
+ })
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 8be4f4249..2b6c3370f 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -4,12 +4,11 @@ import os
import re
import subprocess
-from ..postprocessor.ffmpeg import FFmpegPostProcessor
from .common import FileDownloader
-from ..compat import (
- compat_urlparse,
- compat_urllib_request,
-)
+from .fragment import FragmentFD
+
+from ..compat import compat_urlparse
+from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..utils import (
encodeArgument,
encodeFilename,
@@ -33,6 +32,8 @@ class HlsFD(FileDownloader):
for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')]
args.append(encodeFilename(tmpfilename, True))
+ self._debug_cmd(args)
+
retval = subprocess.call(args)
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
@@ -51,54 +52,50 @@ class HlsFD(FileDownloader):
return False
-class NativeHlsFD(FileDownloader):
+class NativeHlsFD(FragmentFD):
""" A more limited implementation that does not require ffmpeg """
+ FD_NAME = 'hlsnative'
+
def real_download(self, filename, info_dict):
- url = info_dict['url']
- self.report_destination(filename)
- tmpfilename = self.temp_name(filename)
+ man_url = info_dict['url']
+ self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+ manifest = self.ydl.urlopen(man_url).read()
- self.to_screen(
- '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id'])
- data = self.ydl.urlopen(url).read()
- s = data.decode('utf-8', 'ignore')
- segment_urls = []
+ s = manifest.decode('utf-8', 'ignore')
+ fragment_urls = []
for line in s.splitlines():
line = line.strip()
if line and not line.startswith('#'):
segment_url = (
line
if re.match(r'^https?://', line)
- else compat_urlparse.urljoin(url, line))
- segment_urls.append(segment_url)
-
- is_test = self.params.get('test', False)
- remaining_bytes = self._TEST_FILE_SIZE if is_test else None
- byte_counter = 0
- with open(tmpfilename, 'wb') as outf:
- for i, segurl in enumerate(segment_urls):
- self.to_screen(
- '[hlsnative] %s: Downloading segment %d / %d' %
- (info_dict['id'], i + 1, len(segment_urls)))
- seg_req = compat_urllib_request.Request(segurl)
- if remaining_bytes is not None:
- seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1))
-
- segment = self.ydl.urlopen(seg_req).read()
- if remaining_bytes is not None:
- segment = segment[:remaining_bytes]
- remaining_bytes -= len(segment)
- outf.write(segment)
- byte_counter += len(segment)
- if remaining_bytes is not None and remaining_bytes <= 0:
+ else compat_urlparse.urljoin(man_url, line))
+ fragment_urls.append(segment_url)
+ # We only download the first fragment during the test
+ if self.params.get('test', False):
break
- self._hook_progress({
- 'downloaded_bytes': byte_counter,
- 'total_bytes': byte_counter,
+ ctx = {
'filename': filename,
- 'status': 'finished',
- })
- self.try_rename(tmpfilename, filename)
+ 'total_frags': len(fragment_urls),
+ }
+
+ self._prepare_and_start_frag_download(ctx)
+
+ frags_filenames = []
+ for i, frag_url in enumerate(fragment_urls):
+ frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
+ success = ctx['dl'].download(frag_filename, {'url': frag_url})
+ if not success:
+ return False
+ with open(frag_filename, 'rb') as down:
+ ctx['dest_stream'].write(down.read())
+ frags_filenames.append(frag_filename)
+
+ self._finish_frag_download(ctx)
+
+ for frag_file in frags_filenames:
+ os.remove(frag_file)
+
return True
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index b7f144af9..a29f5cf31 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -4,6 +4,7 @@ import errno
import os
import socket
import time
+import re
from .common import FileDownloader
from ..compat import (
@@ -57,6 +58,24 @@ class HttpFD(FileDownloader):
# Establish connection
try:
data = self.ydl.urlopen(request)
+ # When trying to resume, Content-Range HTTP header of response has to be checked
+ # to match the value of requested Range HTTP header. This is due to a webservers
+ # that don't support resuming and serve a whole file with no Content-Range
+ # set in response despite of requested Range (see
+ # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
+ if resume_len > 0:
+ content_range = data.headers.get('Content-Range')
+ if content_range:
+ content_range_m = re.search(r'bytes (\d+)-', content_range)
+ # Content-Range is present and matches requested Range, resume is possible
+ if content_range_m and resume_len == int(content_range_m.group(1)):
+ break
+ # Content-Range is either not present or invalid. Assuming remote webserver is
+ # trying to send the whole file, resume is not possible, so wiping the local file
+ # and performing entire redownload
+ self.report_unable_to_resume()
+ resume_len = 0
+ open_mode = 'wb'
break
except (compat_urllib_error.HTTPError, ) as err:
if (err.code < 500 or err.code >= 600) and err.code != 416:
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 6865b5e2f..7d19bb808 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -131,7 +131,7 @@ class RtmpFD(FileDownloader):
if play_path is not None:
basic_args += ['--playpath', play_path]
if tc_url is not None:
- basic_args += ['--tcUrl', url]
+ basic_args += ['--tcUrl', tc_url]
if test:
basic_args += ['--stop', '1']
if flash_version is not None:
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 41af925cc..39b05ce8f 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,7 +4,10 @@ from .abc import ABCIE
from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+ AdobeTVIE,
+ AdobeTVVideoIE,
+)
from .adultswim import AdultSwimIE
from .aftenposten import AftenpostenIE
from .aftonbladet import AftonbladetIE
@@ -16,9 +19,14 @@ from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
-from .ard import ARDIE, ARDMediathekIE
+from .ard import (
+ ARDIE,
+ ARDMediathekIE,
+ SportschauIE,
+)
from .arte import (
ArteTvIE,
ArteTVPlus7IE,
@@ -35,7 +43,10 @@ from .azubu import AzubuIE
from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
-from .bbccouk import BBCCoUkIE
+from .bbc import (
+ BBCCoUkIE,
+ BBCIE,
+)
from .beeg import BeegIE
from .behindkink import BehindKinkIE
from .beatportpro import BeatportProIE
@@ -103,15 +114,21 @@ from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
DailymotionUserIE,
+ DailymotionCloudIE,
)
from .daum import DaumIE
from .dbtv import DBTVIE
+from .dcn import DCNIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
from .douyutv import DouyuTVIE
+from .dramafever import (
+ DramaFeverIE,
+ DramaFeverSeriesIE,
+)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
@@ -136,11 +153,12 @@ from .ellentv import (
)
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
-from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .escapist import EscapistIE
+from .espn import ESPNIE
+from .esri import EsriVideoIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
@@ -148,10 +166,10 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
-from .firedrive import FiredriveIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
@@ -223,12 +241,21 @@ from .imdb import (
ImdbIE,
ImdbListIE
)
-from .imgur import ImgurIE
+from .imgur import (
+ ImgurIE,
+ ImgurAlbumIE,
+)
from .ina import InaIE
+from .indavideo import (
+ IndavideoIE,
+ IndavideoEmbedIE,
+)
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
+from .ir90tv import Ir90TvIE
from .ivi import (
IviIE,
IviCompilationIE
@@ -243,6 +270,7 @@ from .kaltura import KalturaIE
from .kanalplay import KanalPlayIE
from .kankan import KankanIE
from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
@@ -250,15 +278,27 @@ from .keek import KeekIE
from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
+from .kuwo import (
+ KuwoIE,
+ KuwoAlbumIE,
+ KuwoChartIE,
+ KuwoSingerIE,
+ KuwoCategoryIE,
+ KuwoMvIE,
+)
from .la7 import LA7IE
from .laola1tv import Laola1TvIE
+from .lecture2go import Lecture2GoIE
from .letv import (
LetvIE,
LetvTvIE,
LetvPlaylistIE
)
from .libsyn import LibsynIE
-from .lifenews import LifeNewsIE
+from .lifenews import (
+ LifeNewsIE,
+ LifeEmbedIE,
+)
from .liveleak import LiveLeakIE
from .livestream import (
LivestreamIE,
@@ -303,13 +343,16 @@ from .mtv import (
MTVIE,
MTVServicesEmbeddedIE,
MTVIggyIE,
+ MTVDEIE,
)
from .muenchentv import MuenchenTVIE
from .musicplayon import MusicPlayOnIE
from .musicvault import MusicVaultIE
from .muzu import MuzuTVIE
+from .mwave import MwaveIE
from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
+from .myvi import MyviIE
from .myvideo import MyVideoIE
from .myvidster import MyVidsterIE
from .nationalgeographic import NationalGeographicIE
@@ -320,19 +363,31 @@ from .nbc import (
NBCNewsIE,
NBCSportsIE,
NBCSportsVPlayerIE,
+ MSNBCIE,
+)
+from .ndr import (
+ NDRIE,
+ NJoyIE,
)
-from .ndr import NDRIE
from .ndtv import NDTVIE
from .netzkino import NetzkinoIE
from .nerdcubed import NerdCubedFeedIE
from .nerdist import NerdistIE
+from .neteasemusic import (
+ NetEaseMusicIE,
+ NetEaseMusicAlbumIE,
+ NetEaseMusicSingerIE,
+ NetEaseMusicListIE,
+ NetEaseMusicMvIE,
+ NetEaseMusicProgramIE,
+ NetEaseMusicDjRadioIE,
+)
from .newgrounds import NewgroundsIE
from .newstube import NewstubeIE
from .nextmedia import (
NextMediaIE,
NextMediaActionNewsIE,
- AppleDailyRealtimeNewsIE,
- AppleDailyAnimationNewsIE
+ AppleDailyIE,
)
from .nfb import NFBIE
from .nfl import NFLIE
@@ -346,15 +401,18 @@ from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
+from .nova import NovaIE
from .novamov import NovaMovIE
from .nowness import NownessIE
+from .nowtv import NowTVIE
from .nowvideo import NowVideoIE
from .npo import (
NPOIE,
NPOLiveIE,
NPORadioIE,
NPORadioFragmentIE,
- TegenlichtVproIE,
+ VPROIE,
+ WNLIE
)
from .nrk import (
NRKIE,
@@ -363,11 +421,18 @@ from .nrk import (
)
from .ntvde import NTVDeIE
from .ntvru import NTVRuIE
-from .nytimes import NYTimesIE
+from .nytimes import (
+ NYTimesIE,
+ NYTimesArticleIE,
+)
from .nuvid import NuvidIE
from .odnoklassniki import OdnoklassnikiIE
from .oktoberfesttv import OktoberfestTVIE
-from .ooyala import OoyalaIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+ OoyalaIE,
+ OoyalaExternalIE,
+)
from .openfilm import OpenFilmIE
from .orf import (
ORFTVthekIE,
@@ -378,16 +443,27 @@ from .orf import (
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
+from .periscope import (
+ PeriscopeIE,
+ QuickscopeIE,
+)
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
from .planetaplay import PlanetaPlayIE
from .pladform import PladformIE
from .played import PlayedIE
from .playfm import PlayFMIE
+from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE
+from .pluralsight import (
+ PluralsightIE,
+ PluralsightCourseIE,
+)
from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
@@ -405,6 +481,8 @@ from .qqmusic import (
QQMusicIE,
QQMusicSingerIE,
QQMusicAlbumIE,
+ QQMusicToplistIE,
+ QQMusicPlaylistIE,
)
from .quickvid import QuickVidIE
from .r7 import R7IE
@@ -414,6 +492,7 @@ from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import RaiIE
from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
from .redtube import RedTubeIE
from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
@@ -424,11 +503,11 @@ from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rte import RteIE
from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE
+from .rtvnh import RTVNHIE
from .ruhd import RUHDIE
from .rutube import (
RutubeIE,
@@ -438,6 +517,7 @@ from .rutube import (
RutubePersonIE,
)
from .rutv import RUTVIE
+from .ruutu import RuutuIE
from .sandia import SandiaIE
from .safari import (
SafariIE,
@@ -454,6 +534,7 @@ from .senateisvp import SenateISVPIE
from .servingsys import ServingSysIE
from .sexu import SexuIE
from .sexykarma import SexyKarmaIE
+from .shahid import ShahidIE
from .shared import SharedIE
from .sharesix import ShareSixIE
from .sina import SinaIE
@@ -465,9 +546,16 @@ from .smotri import (
SmotriUserIE,
SmotriBroadcastIE,
)
+from .snagfilms import (
+ SnagFilmsIE,
+ SnagFilmsEmbedIE,
+)
from .snotr import SnotrIE
-from .sockshare import SockshareIE
from .sohu import SohuIE
+from .soompi import (
+ SoompiIE,
+ SoompiShowIE,
+)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
@@ -480,8 +568,10 @@ from .soundgasm import (
)
from .southpark import (
SouthParkIE,
+ SouthParkDeIE,
+ SouthParkDkIE,
SouthParkEsIE,
- SouthparkDeIE,
+ SouthParkNlIE
)
from .space import SpaceIE
from .spankbang import SpankBangIE
@@ -490,7 +580,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
+ SportBoxIE,
+ SportBoxEmbedIE,
+)
from .sportdeutschland import SportDeutschlandIE
from .srf import SrfIE
from .srmediathek import SRMediathekIE
@@ -501,7 +594,10 @@ from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
from .sunporno import SunPornoIE
-from .svtplay import SVTPlayIE
+from .svt import (
+ SVTIE,
+ SVTPlayIE,
+)
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
@@ -518,6 +614,7 @@ from .techtalks import TechTalksIE
from .ted import TEDIE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
+from .telegraaf import TelegraafIE
from .telemb import TeleMBIE
from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
@@ -525,13 +622,24 @@ from .testurl import TestURLIE
from .testtube import TestTubeIE
from .tf1 import TF1IE
from .theonion import TheOnionIE
-from .theplatform import ThePlatformIE
+from .theplatform import (
+ ThePlatformIE,
+ ThePlatformFeedIE,
+)
from .thesixtyone import TheSixtyOneIE
+from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
-from .tmz import TMZIE
-from .tnaflix import TNAFlixIE
+from .tmz import (
+ TMZIE,
+ TMZArticleIE,
+)
+from .tnaflix import (
+ TNAFlixIE,
+ EMPFlixIE,
+ MovieFapIE,
+)
from .thvideo import (
THVideoIE,
THVideoPlaylistIE
@@ -542,12 +650,21 @@ from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
+from .tv2 import (
+ TV2IE,
+ TV2ArticleIE,
+)
from .tv4 import TV4IE
+from .tvc import (
+ TVCIE,
+ TVCArticleIE,
+)
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
@@ -566,6 +683,7 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
+from .twitter import TwitterCardIE
from .ubu import UbuIE
from .udemy import (
UdemyIE,
@@ -583,11 +701,14 @@ from .veoh import VeohIE
from .vessel import VesselIE
from .vesti import VestiIE
from .vevo import VevoIE
-from .vgtv import VGTVIE
+from .vgtv import (
+ BTArticleIE,
+ BTVestlendingenIE,
+ VGTVIE,
+)
from .vh1 import VH1IE
from .vice import ViceIE
from .viddler import ViddlerIE
-from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
@@ -614,12 +735,16 @@ from .vine import (
VineIE,
VineUserIE,
)
-from .viki import VikiIE
+from .viki import (
+ VikiIE,
+ VikiChannelIE,
+)
from .vk import (
VKIE,
VKUserVideosIE,
)
from .vodlocker import VodlockerIE
+from .voicerepublic import VoiceRepublicIE
from .vporn import VpornIE
from .vrt import VRTIE
from .vube import VubeIE
@@ -634,7 +759,10 @@ from .wdr import (
WDRMobileIE,
WDRMausIE,
)
-from .webofstories import WebOfStoriesIE
+from .webofstories import (
+ WebOfStoriesIE,
+ WebOfStoriesPlaylistIE,
+)
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
@@ -643,12 +771,16 @@ from .wrzuta import WrzutaIE
from .wsj import WSJIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+ XHamsterIE,
+ XHamsterEmbedIE,
+)
from .xminus import XMinusIE
from .xnxx import XNXXIE
-from .xvideos import XVideosIE
+from .xstream import XstreamIE
from .xtube import XTubeUserIE, XTubeIE
from .xuite import XuiteIE
+from .xvideos import XVideosIE
from .xxxymovies import XXXYMoviesIE
from .yahoo import (
YahooIE,
@@ -661,6 +793,7 @@ from .yandexmusic import (
YandexMusicPlaylistIE,
)
from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index dc0fb85d6..f9a389f67 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -1,16 +1,20 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ js_to_json,
+ int_or_none,
+)
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
_VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
'md5': 'cb3dd03b18455a661071ee1e28344d9f',
'info_dict': {
@@ -19,22 +23,47 @@ class ABCIE(InfoExtractor):
'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
},
- }
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
+ 'md5': 'db2a5369238b51f9811ad815b69dc086',
+ 'info_dict': {
+ 'id': 'NvqvPeNZsHU',
+ 'ext': 'mp4',
+ 'upload_date': '20150816',
+ 'uploader': 'ABC News (Australia)',
+ 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef',
+ 'uploader_id': 'NewsOnABC',
+ 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
+ },
+ 'add_ie': ['Youtube'],
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- urls_info_json = self._search_regex(
- r'inlineVideoData\.push\((.*?)\);', webpage, 'video urls',
- flags=re.DOTALL)
- urls_info = json.loads(urls_info_json.replace('\'', '"'))
+ mobj = re.search(
+ r'inline(?P<type>Video|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
+ webpage)
+ if mobj is None:
+ raise ExtractorError('Unable to extract video urls')
+
+ urls_info = self._parse_json(
+ mobj.group('json_data'), video_id, transform_source=js_to_json)
+
+ if not isinstance(urls_info, list):
+ urls_info = [urls_info]
+
+ if mobj.group('type') == 'YouTube':
+ return self.playlist_result([
+ self.url_result(url_info['url']) for url_info in urls_info])
+
formats = [{
'url': url_info['url'],
- 'width': int(url_info['width']),
- 'height': int(url_info['height']),
- 'tbr': int(url_info['bitrate']),
- 'filesize': int(url_info['filesize']),
+ 'width': int_or_none(url_info.get('width')),
+ 'height': int_or_none(url_info.get('height')),
+ 'tbr': int_or_none(url_info.get('bitrate')),
+ 'filesize': int_or_none(url_info.get('filesize')),
} for url_info in urls_info]
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 97d128560..5e43adc51 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -5,6 +5,8 @@ from ..utils import (
parse_duration,
unified_strdate,
str_to_int,
+ float_or_none,
+ ISO639Utils,
)
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
'view_count': view_count,
'formats': formats,
}
+
+
+class AdobeTVVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+ _TEST = {
+ # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+ 'url': 'https://video.tv.adobe.com/v/2456/',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ player_params = self._parse_json(self._search_regex(
+ r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+ video_id)
+
+ formats = [{
+ 'url': source['src'],
+ 'width': source.get('width'),
+ 'height': source.get('height'),
+ 'tbr': source.get('bitrate'),
+ } for source in player_params['sources']]
+
+ # For both metadata and downloaded files the duration varies among
+ # formats. I just pick the max one
+ duration = max(filter(None, [
+ float_or_none(source.get('duration'), scale=1000)
+ for source in player_params['sources']]))
+
+ subtitles = {}
+ for translation in player_params.get('translations', []):
+ lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+ if lang_id not in subtitles:
+ subtitles[lang_id] = []
+ subtitles[lang_id].append({
+ 'url': translation['vttPath'],
+ 'ext': 'vtt',
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': player_params['title'],
+ 'description': self._og_search_description(webpage),
+ 'duration': duration,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py
index e15c015fb..0c00acfb5 100644
--- a/youtube_dl/extractor/aftenposten.py
+++ b/youtube_dl/extractor/aftenposten.py
@@ -1,21 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_iso8601,
- xpath_with_ns,
- xpath_text,
- find_xpath_attr,
-)
class AftenpostenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'
-
_TEST = {
'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
'md5': 'fd828cd29774a729bf4d4425fe192972',
@@ -30,69 +20,4 @@ class AftenpostenIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- data = self._download_xml(
- 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
-
- NS_MAP = {
- 'atom': 'http://www.w3.org/2005/Atom',
- 'xt': 'http://xstream.dk/',
- 'media': 'http://search.yahoo.com/mrss/',
- }
-
- entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
-
- title = xpath_text(
- entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
- description = xpath_text(
- entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
- timestamp = parse_iso8601(xpath_text(
- entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
-
- formats = []
- media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
- for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
- media_url = media_content.get('url')
- if not media_url:
- continue
- tbr = int_or_none(media_content.get('bitrate'))
- mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
- if mobj:
- formats.append({
- 'url': mobj.group('url'),
- 'play_path': 'mp4:%s' % mobj.group('playpath'),
- 'app': mobj.group('app'),
- 'ext': 'flv',
- 'tbr': tbr,
- 'format_id': 'rtmp-%d' % tbr,
- })
- else:
- formats.append({
- 'url': media_url,
- 'tbr': tbr,
- })
- self._sort_formats(formats)
-
- link = find_xpath_attr(
- entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
- if link is not None:
- formats.append({
- 'url': link.get('href'),
- 'format_id': link.get('rel'),
- })
-
- thumbnails = [{
- 'url': splash.get('url'),
- 'width': int_or_none(splash.get('width')),
- 'height': int_or_none(splash.get('height')),
- } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'timestamp': timestamp,
- 'formats': formats,
- 'thumbnails': thumbnails,
- }
+ return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream')
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index a117502bc..e0518cf26 100644
--- a/youtube_dl/extractor/aftonbladet.py
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -6,11 +6,11 @@ from ..utils import int_or_none
class AftonbladetIE(InfoExtractor):
- _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+ _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+ 'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'info_dict': {
- 'id': 'article36015',
+ 'id': '36015',
'ext': 'mp4',
'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
# find internal video meta data
meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
- internal_meta_id = self._html_search_regex(
- r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+ player_config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+ internal_meta_id = player_config['videoId']
internal_meta_url = meta_url % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
new file mode 100644
index 000000000..ea7a70393
--- /dev/null
+++ b/youtube_dl/extractor/appleconnect.py
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ str_to_int,
+ ExtractorError
+)
+
+
+class AppleConnectIE(InfoExtractor):
+ _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
+ _TEST = {
+ 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+ 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'info_dict': {
+ 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+ 'ext': 'm4v',
+ 'title': 'Energy',
+ 'uploader': 'Drake',
+ 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'upload_date': '20150710',
+ 'timestamp': 1436545535,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ try:
+ video_json = self._html_search_regex(
+ r'class="auc-video-data">(\{.*?\})', webpage, 'json')
+ except ExtractorError:
+ raise ExtractorError('This post doesn\'t contain a video', expected=True)
+
+ video_data = self._parse_json(video_json, video_id)
+ timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
+ like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+
+ return {
+ 'id': video_id,
+ 'url': video_data['sslSrc'],
+ 'title': video_data['title'],
+ 'description': video_data['description'],
+ 'uploader': video_data['artistName'],
+ 'thumbnail': video_data['artworkUrl'],
+ 'timestamp': timestamp,
+ 'like_count': like_count,
+ }
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 9fc35a42b..8feb7cb74 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -33,7 +33,7 @@ class ArchiveOrgIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- json_url = url + ('?' if '?' in url else '&') + 'output=json'
+ json_url = url + ('&' if '?' in url else '?') + 'output=json'
data = self._download_json(json_url, video_id)
def get_optional(data_dict, field):
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 6a35ea463..6f465789b 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -8,6 +8,7 @@ from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
+ get_element_by_attribute,
qualities,
int_or_none,
parse_duration,
@@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor):
_VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
- 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
- 'only_matching': True,
+ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
+ 'info_dict': {
+ 'id': '29582122',
+ 'ext': 'mp4',
+ 'title': 'Ich liebe das Leben trotzdem',
+ 'description': 'md5:45e4c225c72b27993314b31a84a5261c',
+ 'duration': 4557,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+ 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
+ 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
'info_dict': {
- 'id': '22490580',
+ 'id': '29522730',
'ext': 'mp4',
- 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
- 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
+ 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)',
+ 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
+ 'duration': 5252,
},
- 'skip': 'Blocked outside of Germany',
+ }, {
+ # audio
+ 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+ 'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
+ 'info_dict': {
+ 'id': '28488308',
+ 'ext': 'mp3',
+ 'title': 'Tod eines Fußballers',
+ 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
+ 'duration': 3240,
+ },
+ }, {
+ 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+ 'only_matching': True,
}]
+ def _extract_media_info(self, media_info_url, webpage, video_id):
+ media_info = self._download_json(
+ media_info_url, video_id, 'Downloading media JSON')
+
+ formats = self._extract_formats(media_info, video_id)
+
+ if not formats:
+ if '"fsk"' in webpage:
+ raise ExtractorError(
+ 'This video is only available after 20:00', expected=True)
+ elif media_info.get('_geoblocked'):
+ raise ExtractorError('This video is not available due to geo restriction', expected=True)
+
+ self._sort_formats(formats)
+
+ duration = int_or_none(media_info.get('_duration'))
+ thumbnail = media_info.get('_previewImage')
+
+ subtitles = {}
+ subtitle_url = media_info.get('_subtitleUrl')
+ if subtitle_url:
+ subtitles['de'] = [{
+ 'ext': 'srt',
+ 'url': subtitle_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _extract_formats(self, media_info, video_id):
+ type_ = media_info.get('_type')
+ media_array = media_info.get('_mediaArray', [])
+ formats = []
+ for num, media in enumerate(media_array):
+ for stream in media.get('_mediaStreamArray', []):
+ stream_urls = stream.get('_stream')
+ if not stream_urls:
+ continue
+ if not isinstance(stream_urls, list):
+ stream_urls = [stream_urls]
+ quality = stream.get('_quality')
+ server = stream.get('_server')
+ for stream_url in stream_urls:
+ ext = determine_ext(stream_url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+ video_id, preference=-1, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
+ else:
+ if server and server.startswith('rtmp'):
+ f = {
+ 'url': server,
+ 'play_path': stream_url,
+ 'format_id': 'a%s-rtmp-%s' % (num, quality),
+ }
+ elif stream_url.startswith('http'):
+ f = {
+ 'url': stream_url,
+ 'format_id': 'a%s-%s-%s' % (num, ext, quality)
+ }
+ else:
+ continue
+ m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ if type_ == 'audio':
+ f['vcodec'] = 'none'
+ formats.append(f)
+ return formats
+
def _real_extract(self, url):
# determine video id from url
m = re.match(self._VALID_URL, url)
@@ -92,46 +199,22 @@ class ARDMediathekIE(InfoExtractor):
'format_id': fid,
'url': furl,
})
+ self._sort_formats(formats)
+ info = {
+ 'formats': formats,
+ }
else: # request JSON file
- media_info = self._download_json(
- 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
- # The second element of the _mediaArray contains the standard http urls
- streams = media_info['_mediaArray'][1]['_mediaStreamArray']
- if not streams:
- if '"fsk"' in webpage:
- raise ExtractorError('This video is only available after 20:00')
-
- formats = []
- for s in streams:
- if type(s['_stream']) == list:
- for index, url in enumerate(s['_stream'][::-1]):
- quality = s['_quality'] + index
- formats.append({
- 'quality': quality,
- 'url': url,
- 'format_id': '%s-%s' % (determine_ext(url), quality)
- })
- continue
-
- format = {
- 'quality': s['_quality'],
- 'url': s['_stream'],
- }
-
- format['format_id'] = '%s-%s' % (
- determine_ext(format['url']), format['quality'])
+ info = self._extract_media_info(
+ 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
- formats.append(format)
-
- self._sort_formats(formats)
-
- return {
+ info.update({
'id': video_id,
'title': title,
'description': description,
- 'formats': formats,
'thumbnail': thumbnail,
- }
+ })
+
+ return info
class ARDIE(InfoExtractor):
@@ -189,3 +272,41 @@ class ARDIE(InfoExtractor):
'upload_date': upload_date,
'thumbnail': thumbnail,
}
+
+
+class SportschauIE(ARDMediathekIE):
+ IE_NAME = 'Sportschau'
+ _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
+ _TESTS = [{
+ 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
+ 'info_dict': {
+ 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
+ 'ext': 'mp4',
+ 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ base_url = mobj.group('baseurl')
+
+ webpage = self._download_webpage(url, video_id)
+ title = get_element_by_attribute('class', 'headline', webpage)
+ description = self._html_search_meta('description', webpage, 'description')
+
+ info = self._extract_media_info(
+ base_url + '-mc_defaultQuality-h.json', webpage, video_id)
+
+ info.update({
+ 'title': title,
+ 'description': description,
+ })
+
+ return info
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 8273bd6c9..76de24477 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -7,7 +7,6 @@ from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
unified_strdate,
- get_element_by_id,
get_element_by_attribute,
int_or_none,
qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
def _real_extract(self, url):
anchor_id, lang = self._extract_url_info(url)
webpage = self._download_webpage(url, anchor_id)
- row = get_element_by_id(anchor_id, webpage)
+ row = self._search_regex(
+ r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+ webpage, 'row')
return self._extract_from_webpage(row, anchor_id, lang)
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
index 906895c1e..e37ee4440 100644
--- a/youtube_dl/extractor/baidu.py
+++ b/youtube_dl/extractor/baidu.py
@@ -8,6 +8,7 @@ from ..compat import compat_urlparse
class BaiduVideoIE(InfoExtractor):
+ IE_DESC = '百度视频'
_VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
_TESTS = [{
'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
new file mode 100644
index 000000000..abc5a44a1
--- /dev/null
+++ b/youtube_dl/extractor/bbc.py
@@ -0,0 +1,780 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+)
+from ..compat import compat_HTTPError
+
+
+class BBCCoUkIE(InfoExtractor):
+ IE_NAME = 'bbc.co.uk'
+ IE_DESC = 'BBC iPlayer'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
+
+ _MEDIASELECTOR_URLS = [
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ ]
+
+ _TESTS = [
+ {
+ 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
+ 'info_dict': {
+ 'id': 'b039d07m',
+ 'ext': 'flv',
+ 'title': 'Kaleidoscope, Leonard Cohen',
+ 'description': 'The Canadian poet and songwriter reflects on his musical career.',
+ 'duration': 1740,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+ 'info_dict': {
+ 'id': 'b00yng1d',
+ 'ext': 'flv',
+ 'title': 'The Man in Black: Series 3: The Printed Name',
+ 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+ 'duration': 1800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Episode is no longer available on BBC iPlayer Radio',
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+ 'info_dict': {
+ 'id': 'b00yng1d',
+ 'ext': 'flv',
+ 'title': 'The Voice UK: Series 3: Blind Auditions 5',
+ 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+ 'duration': 5100,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
+ 'info_dict': {
+ 'id': 'b03k3pb7',
+ 'ext': 'flv',
+ 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
+ 'description': '2. Invasion',
+ 'duration': 3600,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+ 'info_dict': {
+ 'id': 'b04v209v',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, The Essential New Tune Special',
+ 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
+ 'note': 'Audio',
+ 'info_dict': {
+ 'id': 'p02frcch',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
+ 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
+ 'duration': 3507,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
+ 'note': 'Video',
+ 'info_dict': {
+ 'id': 'p025c103',
+ 'ext': 'flv',
+ 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
+ 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
+ 'duration': 226,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
+ 'info_dict': {
+ 'id': 'p02n76xf',
+ 'ext': 'flv',
+ 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
+ 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
+ 'duration': 3540,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'geolocation',
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+ 'info_dict': {
+ 'id': 'b05zmgw1',
+ 'ext': 'flv',
+ 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+ 'title': 'Royal Academy Summer Exhibition',
+ 'duration': 3540,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'geolocation',
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
+ 'only_matching': True,
+ }
+ ]
+
+ # Signals that a media selection document contained an <error> element.
+ # Raised by _extract_medias() with the element's ``id`` attribute; the id is
+ # inspected by _download_media_selector() (e.g. 'notukerror').
+ class MediaSelectionError(Exception):
+ def __init__(self, id):
+ # Error identifier exactly as reported in the media selection XML.
+ self.id = id
+
+    def _extract_asx_playlist(self, connection, programme_id):
+        """Download the ASX playlist that *connection* points at.
+
+        Returns the ``href`` attribute of every ``Entry/ref`` element, in
+        document order.
+        """
+        playlist_url = connection.get('href')
+        asx = self._download_xml(playlist_url, programme_id, 'Downloading ASX playlist')
+        hrefs = []
+        for ref in asx.findall('./Entry/ref'):
+            hrefs.append(ref.get('href'))
+        return hrefs
+
+    def _extract_connection(self, connection, programme_id):
+        """Turn one ``connection`` element into a list of format dicts.
+
+        Handles ``rtmp`` connections and three flavours of ``http``
+        connections (ASX playlist, DASH, direct link). Any other protocol
+        yields an empty list.
+        """
+        protocol = connection.get('protocol')
+        supplier = connection.get('supplier')
+
+        if protocol == 'rtmp':
+            application = connection.get('application', 'ondemand')
+            auth_string = connection.get('authString')
+            identifier = connection.get('identifier')
+            server = connection.get('server')
+            return [{
+                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                'play_path': identifier,
+                'app': '%s?%s' % (application, auth_string),
+                'page_url': 'http://www.bbc.co.uk',
+                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                'rtmp_live': False,
+                'ext': 'flv',
+                'format_id': supplier,
+            }]
+
+        if protocol != 'http':
+            return []
+
+        # ASX playlist: one format per referenced stream
+        if supplier == 'asx':
+            return [{
+                'url': ref,
+                'format_id': 'ref%s_%s' % (index, supplier),
+            } for index, ref in enumerate(
+                self._extract_asx_playlist(connection, programme_id))]
+
+        # Skip DASH until supported
+        if connection.get('transferFormat') == 'dash':
+            return []
+
+        # Direct link
+        return [{
+            'url': connection.get('href'),
+            'format_id': supplier,
+        }]
+
+    def _extract_items(self, playlist):
+        """Return the EMP-playlist ``item`` elements directly under *playlist*."""
+        item_path = './{http://bbc.co.uk/2008/emp/playlist}item'
+        return playlist.findall(item_path)
+
+    def _extract_medias(self, media_selection):
+        """Return the ``media`` elements of a media selection document.
+
+        Raises ``BBCCoUkIE.MediaSelectionError`` carrying the error id when
+        the document contains an ``error`` element instead.
+        """
+        error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
+        if error is None:
+            return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
+        raise BBCCoUkIE.MediaSelectionError(error.get('id'))
+
+    def _extract_connections(self, media):
+        """Return the ``connection`` elements directly under a ``media`` element."""
+        connection_path = './{http://bbc.co.uk/2008/mp/mediaselection}connection'
+        return media.findall(connection_path)
+
+    def _extract_video(self, media, programme_id):
+        """Build the format dicts for a ``media`` element of kind ``video``.
+
+        Every format produced by the element's connections is tagged with the
+        video attributes of *media* and gets its ``format_id`` prefixed with
+        the media ``service``.
+        """
+        video_fields = {
+            'vbr': int_or_none(media.get('bitrate')),
+            'vcodec': media.get('encoding'),
+            'width': int_or_none(media.get('width')),
+            'height': int_or_none(media.get('height')),
+            'filesize': int_or_none(media.get('media_file_size')),
+        }
+        service = media.get('service')
+
+        formats = []
+        for connection in self._extract_connections(media):
+            for fmt in self._extract_connection(connection, programme_id):
+                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
+                fmt.update(video_fields)
+                formats.append(fmt)
+        return formats
+
+    def _extract_audio(self, media, programme_id):
+        """Build the format dicts for a ``media`` element of kind ``audio``.
+
+        Every format produced by the element's connections gets the audio
+        bitrate and codec of *media* and a ``format_id`` prefixed with the
+        media ``service``.
+        """
+        audio_fields = {
+            'abr': int_or_none(media.get('bitrate')),
+            'acodec': media.get('encoding'),
+        }
+        service = media.get('service')
+
+        formats = []
+        for connection in self._extract_connections(media):
+            for fmt in self._extract_connection(connection, programme_id):
+                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
+                fmt.update(audio_fields)
+                formats.append(fmt)
+        return formats
+
+    def _get_subtitles(self, media, programme_id):
+        """Map subtitle language to a one-entry list of TTML caption URLs.
+
+        Each captions document is downloaded only to read its ``xml:lang``
+        attribute (``en`` when absent); a later connection with the same
+        language replaces an earlier one.
+        """
+        subtitles = {}
+        for connection in self._extract_connections(media):
+            captions_url = connection.get('href')
+            captions = self._download_xml(captions_url, programme_id, 'Downloading captions')
+            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+            subtitles[lang] = [{
+                'url': captions_url,
+                'ext': 'ttml',
+            }]
+        return subtitles
+
+    def _raise_extractor_error(self, media_selection_error):
+        """Re-raise a media selection error as an expected ``ExtractorError``."""
+        message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
+        raise ExtractorError(message, expected=True)
+
+    def _download_media_selector(self, programme_id):
+        """Try each media selector URL in turn and return ``(formats, subtitles)``.
+
+        A ``notukerror`` from one selector moves on to the next one; any other
+        media selection error is reported immediately. When every selector
+        answered ``notukerror`` the last such error is reported.
+        """
+        geo_error = None
+        for url_template in self._MEDIASELECTOR_URLS:
+            try:
+                return self._download_media_selector_url(
+                    url_template % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as error:
+                if error.id != 'notukerror':
+                    self._raise_extractor_error(error)
+                # Remember it and give the next selector a chance.
+                geo_error = error
+        self._raise_extractor_error(geo_error)
+
+    def _download_media_selector_url(self, url, programme_id=None):
+        """Fetch one media selector document and return ``(formats, subtitles)``.
+
+        The body of an HTTP 403 response is parsed as the media selection XML
+        as well; any other download failure propagates unchanged.
+        """
+        try:
+            media_selection = self._download_xml(
+                url, programme_id, 'Downloading media selection XML')
+        except ExtractorError as download_error:
+            cause = download_error.cause
+            if not (isinstance(cause, compat_HTTPError) and cause.code == 403):
+                raise
+            media_selection = xml.etree.ElementTree.fromstring(cause.read().decode('utf-8'))
+        return self._process_media_selector(media_selection, programme_id)
+
+    def _process_media_selector(self, media_selection, programme_id):
+        """Collect formats and subtitles from a media selection document.
+
+        Returns ``(formats, subtitles)``; ``subtitles`` stays ``None`` when the
+        document has no ``captions`` media. Media of other kinds are ignored.
+        """
+        collected_formats = []
+        captions = None
+
+        for media in self._extract_medias(media_selection):
+            media_kind = media.get('kind')
+            if media_kind == 'captions':
+                captions = self.extract_subtitles(media, programme_id)
+            elif media_kind == 'video':
+                collected_formats += self._extract_video(media, programme_id)
+            elif media_kind == 'audio':
+                collected_formats += self._extract_audio(media, programme_id)
+        return collected_formats, captions
+
+    def _download_playlist(self, playlist_id):
+        """Resolve *playlist_id* through the programmes playlist JSON.
+
+        Returns a ``(programme_id, title, description, duration, formats,
+        subtitles)`` tuple. The legacy XML playlist is used instead when the
+        JSON request fails with HTTP 404, when the JSON carries no
+        ``defaultAvailableVersion``, or when none of its items is a
+        ``programme``/``radioProgramme``.
+        """
+        try:
+            playlist = self._download_json(
+                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+                playlist_id, 'Downloading playlist JSON')
+
+            version = playlist.get('defaultAvailableVersion')
+            if version:
+                smp_config = version['smpConfig']
+                title = smp_config['title']
+                description = smp_config['summary']
+                result = None
+                for item in smp_config['items']:
+                    if item['kind'] not in ('programme', 'radioProgramme'):
+                        continue
+                    programme_id = item.get('vpid')
+                    duration = int_or_none(item.get('duration'))
+                    formats, subtitles = self._download_media_selector(programme_id)
+                    # As before, a later matching item overrides an earlier one.
+                    result = (programme_id, title, description, duration, formats, subtitles)
+                # Without a matching item there is nothing to return from the
+                # JSON; previously this path hit unbound locals. Fall through
+                # to the legacy playlist instead.
+                if result is not None:
+                    return result
+        except ExtractorError as ee:
+            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+                raise
+
+        # fallback to legacy playlist
+        return self._process_legacy_playlist(playlist_id)
+
+    def _process_legacy_playlist_url(self, url, display_id):
+        """Download the legacy playlist at *url* and extract its media info."""
+        return self._extract_from_legacy_playlist(
+            self._download_legacy_playlist_url(url, display_id), display_id)
+
+    def _process_legacy_playlist(self, playlist_id):
+        """Process the legacy iPlayer playlist that belongs to *playlist_id*."""
+        legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
+        return self._process_legacy_playlist_url(legacy_url, playlist_id)
+
+    def _download_legacy_playlist_url(self, url, playlist_id=None):
+        """Download and parse the legacy playlist XML at *url*."""
+        note = 'Downloading legacy playlist XML'
+        return self._download_xml(url, playlist_id, note)
+
+    def _extract_from_legacy_playlist(self, playlist, playlist_id):
+        """Extract media info from a parsed legacy (EMP) playlist.
+
+        Returns a ``(programme_id, title, description, duration, formats,
+        subtitles)`` tuple for the last ``programme``/``radioProgramme`` item.
+
+        Raises ``ExtractorError`` when the playlist reports ``noItems`` or
+        contains no programme item at all.
+        """
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % playlist_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % playlist_id
+            elif reason == 'noMedia':
+                msg = 'Episode %s is not currently available' % playlist_id
+            else:
+                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
+            raise ExtractorError(msg, expected=True)
+
+        def get_from_attributes(element):
+            # First attribute that looks like a programme id wins.
+            for p in ('identifier', 'group'):
+                value = element.get(p)
+                if value and re.match(r'^[pb][\da-z]{7}$', value):
+                    return value
+
+        def get_programme_id(item):
+            # The mediator keeps priority so that ids which resolved before
+            # still resolve to the same value; the item's own attributes are
+            # now used as a fallback (their lookup result used to be dropped).
+            mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator')
+            if mediator is not None:
+                programme_id = get_from_attributes(mediator)
+                if programme_id:
+                    return programme_id
+            return get_from_attributes(item)
+
+        result = None
+        for item in self._extract_items(playlist):
+            if item.get('kind') not in ('programme', 'radioProgramme'):
+                continue
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+
+            programme_id = get_programme_id(item)
+            duration = int_or_none(item.get('duration'))
+            # TODO: programme_id can be None and media items can be incorporated right inside
+            # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+            # as f4m and m3u8
+            formats, subtitles = self._download_media_selector(programme_id)
+            result = (programme_id, title, description, duration, formats, subtitles)
+
+        if result is None:
+            # Previously this surfaced as an UnboundLocalError on return.
+            raise ExtractorError(
+                'No programme items found in playlist %s' % playlist_id)
+        return result
+
+    def _real_extract(self, url):
+        """Extract a single programme from a bbc.co.uk page.
+
+        The vpid is looked up in the ``mediator.bind(...)`` player JSON first
+        and then with a plain ``"vpid"`` regex; when neither yields an id the
+        programme is resolved through the playlist endpoints instead.
+        """
+        group_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, group_id, 'Downloading video page')
+
+        programme_id = None
+        # Only the player JSON and the playlist path provide a duration, so it
+        # must start out defined: the regex fallback below finds a vpid without
+        # one, which used to raise UnboundLocalError when building the result.
+        duration = None
+
+        tviplayer = self._search_regex(
+            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
+            webpage, 'player', default=None)
+
+        if tviplayer:
+            player = self._parse_json(tviplayer, group_id).get('player', {})
+            duration = int_or_none(player.get('duration'))
+            programme_id = player.get('vpid')
+
+        if not programme_id:
+            programme_id = self._search_regex(
+                r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
+
+        if programme_id:
+            formats, subtitles = self._download_media_selector(programme_id)
+            title = self._og_search_title(webpage)
+            description = self._search_regex(
+                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
+                webpage, 'description', fatal=False)
+        else:
+            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': programme_id,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class BBCIE(BBCCoUkIE):
+ IE_NAME = 'bbc'
+ IE_DESC = 'BBC'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+
+ _MEDIASELECTOR_URLS = [
+ # Provides more formats, namely direct mp4 links, but fails on some videos with
+ # notukerror for non UK (?) users (e.g.
+ # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+ 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+ # Provides fewer formats, but works everywhere for everybody (hopefully)
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ ]
+
+ _TESTS = [{
+ # article with multiple videos embedded with data-media-meta containing
+ # playlist.sxml, externalId and no direct video links
+ 'url': 'http://www.bbc.com/news/world-europe-32668511',
+ 'info_dict': {
+ 'id': 'world-europe-32668511',
+ 'title': 'Russia stages massive WW2 parade despite Western boycott',
+ 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
+ },
+ 'playlist_count': 2,
+ }, {
+ # article with multiple videos embedded with data-media-meta (more videos)
+ 'url': 'http://www.bbc.com/news/business-28299555',
+ 'info_dict': {
+ 'id': 'business-28299555',
+ 'title': 'Farnborough Airshow: Video highlights',
+ 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
+ },
+ 'playlist_count': 9,
+ 'skip': 'Save time',
+ }, {
+ # article with multiple videos embedded with `new SMP()`
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+ 'info_dict': {
+ 'id': '3662a707-0af9-3149-963f-47bea720b460',
+ 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_count': 18,
+ }, {
+ # single video embedded with mediaAssetPage.init()
+ 'url': 'http://www.bbc.com/news/world-europe-32041533',
+ 'info_dict': {
+ 'id': 'p02mprgb',
+ 'ext': 'mp4',
+ 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+ 'duration': 47,
+ 'timestamp': 1427219242,
+ 'upload_date': '20150324',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # article with single video embedded with data-media-meta containing
+ # direct video links (for now these are extracted) and playlist.xml (with
+ # media items as f4m and m3u8 - currently unsupported)
+ 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
+ 'info_dict': {
+ 'id': '150615_telabyad_kentin_cogu',
+ 'ext': 'mp4',
+ 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
+ 'duration': 47,
+ 'timestamp': 1434397334,
+ 'upload_date': '20150615',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video embedded with mediaAssetPage.init() (regional section)
+ 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
+ 'info_dict': {
+ 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+ 'ext': 'mp4',
+ 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
+ 'duration': 87,
+ 'timestamp': 1434713142,
+ 'upload_date': '20150619',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video from video playlist embedded with vxp-playlist-data JSON
+ 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+ 'info_dict': {
+ 'id': 'p02w6qjc',
+ 'ext': 'mp4',
+ 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+ 'duration': 56,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video story with digitalData
+ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
+ 'info_dict': {
+ 'id': 'p02q6gc4',
+ 'ext': 'flv',
+ 'title': 'Sri Lanka’s spicy secret',
+ 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
+ 'timestamp': 1437674293,
+ 'upload_date': '20150723',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # single video story without digitalData
+ 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
+ 'info_dict': {
+ 'id': 'p018zqqg',
+ 'ext': 'mp4',
+ 'title': 'Hyundai Santa Fe Sport: Rock star',
+ 'description': 'md5:b042a26142c4154a6e472933cf20793d',
+ 'timestamp': 1368473503,
+ 'upload_date': '20130513',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # single video with playlist.sxml URL
+ 'url': 'http://www.bbc.com/sport/0/football/33653409',
+ 'info_dict': {
+ 'id': 'p02xycnp',
+ 'ext': 'mp4',
+ 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
+ 'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
+ 'duration': 140,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # single video with playlist URL from weather section
+ 'url': 'http://www.bbc.com/weather/features/33601775',
+ 'only_matching': True,
+ }, {
+ # custom redirection to www.bbc.com
+ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
+ 'only_matching': True,
+ }]
+
+    @classmethod
+    def suitable(cls, url):
+        """Decline every URL that ``BBCCoUkIE`` accepts; otherwise defer to the base check."""
+        if BBCCoUkIE.suitable(url):
+            return False
+        return super(BBCIE, cls).suitable(url)
+
+    def _extract_from_media_meta(self, media_meta, video_id):
+        """Return ``(formats, subtitles)`` for one media metadata dict.
+
+        Sources are tried in this order: direct ``sourceFiles`` links, the
+        media selector for ``externalId``, and the legacy playlist at ``href``.
+        ``([], [])`` is returned when none of them is present.
+        """
+        # Direct links to media in media metadata (e.g.
+        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+        # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
+        source_files = media_meta.get('sourceFiles')
+        if source_files:
+            direct_formats = []
+            for format_id, source in source_files.items():
+                if not source.get('url'):
+                    continue
+                direct_formats.append({
+                    'url': source['url'],
+                    'format_id': format_id,
+                    'ext': source.get('encoding'),
+                    'tbr': float_or_none(source.get('bitrate'), 1000),
+                    'filesize': int_or_none(source.get('filesize')),
+                })
+            return direct_formats, []
+
+        external_id = media_meta.get('externalId')
+        if external_id:
+            return self._download_media_selector(external_id)
+
+        # Process playlist.sxml as legacy playlist
+        playlist_url = media_meta.get('href')
+        if not playlist_url:
+            return [], []
+
+        legacy_playlist = self._download_legacy_playlist_url(playlist_url)
+        _, _, _, _, legacy_formats, legacy_subtitles = self._extract_from_legacy_playlist(
+            legacy_playlist, video_id)
+        return legacy_formats, legacy_subtitles
+
+    def _real_extract(self, url):
+        """Extract a single video or a playlist of videos from a BBC page.
+
+        The page is probed, in order, for: a legacy ``playlist`` <param> URL,
+        a single-story vpid, embedded bbc.co.uk player URLs (handed over to
+        the ``BBCCoUk`` extractor), ``data-media-meta`` attributes, a
+        ``mediaAssetPage.init()`` call and finally ``vxp-playlist-data`` JSON.
+        The first two return a single info dict, the rest a playlist result.
+        """
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        timestamp = parse_iso8601(self._search_regex(
+            [r'"datePublished":\s*"([^"]+)',
+             r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
+             r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],
+            webpage, 'date', default=None))
+
+        # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/33653409)
+        playlist = self._search_regex(
+            r'<param[^>]+name="playlist"[^>]+value="([^"]+)"',
+            webpage, 'playlist', default=None)
+        if playlist:
+            programme_id, title, description, duration, formats, subtitles = \
+                self._process_legacy_playlist_url(playlist, playlist_id)
+            self._sort_formats(formats)
+            return {
+                'id': programme_id,
+                'title': title,
+                'description': description,
+                'duration': duration,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        programme_id = self._search_regex(
+            [r'data-video-player-vpid="([\da-z]{8})"',
+             r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
+            webpage, 'vpid', default=None)
+        if programme_id:
+            formats, subtitles = self._download_media_selector(programme_id)
+            self._sort_formats(formats)
+            # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
+            # or unparsable; a non-fatal parse failure yields None, so fall
+            # back to an empty dict before calling .get() on it.
+            digital_data = self._parse_json(
+                self._search_regex(
+                    r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
+                programme_id, fatal=False) or {}
+            page_info = digital_data.get('page', {}).get('pageInfo', {})
+            title = page_info.get('pageName') or self._og_search_title(webpage)
+            description = page_info.get('description') or self._og_search_description(webpage)
+            timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
+            return {
+                'id': programme_id,
+                'title': title,
+                'description': description,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        playlist_title = self._html_search_regex(
+            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
+        playlist_description = self._og_search_description(webpage, default=None)
+
+        def extract_all(pattern):
+            # Parse every regex match as JSON, dropping the ones that fail.
+            return list(filter(None, map(
+                lambda s: self._parse_json(s, playlist_id, fatal=False),
+                re.findall(pattern, webpage))))
+
+        # Multiple video article (e.g.
+        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+        entries = []
+        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+            if embed_url and re.match(EMBED_URL, embed_url):
+                entries.append(embed_url)
+        entries.extend(re.findall(
+            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+        if entries:
+            return self.playlist_result(
+                [self.url_result(entry, 'BBCCoUk') for entry in entries],
+                playlist_id, playlist_title, playlist_description)
+
+        # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
+        medias = extract_all(r"data-media-meta='({[^']+})'")
+
+        if not medias:
+            # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
+            media_asset = self._search_regex(
+                r'mediaAssetPage\.init\(\s*({.+?}), "/',
+                webpage, 'media asset', default=None)
+            if media_asset:
+                # A non-fatal parse failure yields None; treat it as "no videos"
+                # so the vxp-playlist-data probe below still gets its turn.
+                media_asset_page = self._parse_json(
+                    media_asset, playlist_id, fatal=False) or {}
+                medias = []
+                for video in media_asset_page.get('videos', {}).values():
+                    medias.extend(video.values())
+
+        if not medias:
+            # Multiple video playlist with single `now playing` entry (e.g.
+            # http://www.bbc.com/news/video_and_audio/must_see/33767813)
+            vxp_playlist = self._parse_json(
+                self._search_regex(
+                    r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
+                    webpage, 'playlist data'),
+                playlist_id)
+            playlist_medias = []
+            for item in vxp_playlist:
+                media = item.get('media')
+                if not media:
+                    continue
+                playlist_medias.append(media)
+                # Download single video if found media with asset id matching the video id from URL
+                if item.get('advert', {}).get('assetId') == playlist_id:
+                    medias = [media]
+                    break
+            # Fallback to the whole playlist
+            if not medias:
+                medias = playlist_medias
+
+        entries = []
+        for num, media_meta in enumerate(medias, start=1):
+            formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
+            if not formats:
+                continue
+            self._sort_formats(formats)
+
+            video_id = media_meta.get('externalId')
+            if not video_id:
+                video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
+
+            title = media_meta.get('caption')
+            if not title:
+                title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
+
+            duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
+
+            images = []
+            for image in media_meta.get('images', {}).values():
+                images.extend(image.values())
+            if 'image' in media_meta:
+                images.append(media_meta['image'])
+
+            thumbnails = [{
+                'url': image.get('href'),
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
+            } for image in images]
+
+            entries.append({
+                'id': video_id,
+                'title': title,
+                'thumbnails': thumbnails,
+                'duration': duration,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            })
+
+        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
deleted file mode 100644
index 249bc6bbd..000000000
--- a/youtube_dl/extractor/bbccouk.py
+++ /dev/null
@@ -1,380 +0,0 @@
-from __future__ import unicode_literals
-
-import xml.etree.ElementTree
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- int_or_none,
-)
-from ..compat import compat_HTTPError
-
-
-class BBCCoUkIE(InfoExtractor):
- IE_NAME = 'bbc.co.uk'
- IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
-
- _TESTS = [
- {
- 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
- 'info_dict': {
- 'id': 'b039d07m',
- 'ext': 'flv',
- 'title': 'Kaleidoscope, Leonard Cohen',
- 'description': 'The Canadian poet and songwriter reflects on his musical career.',
- 'duration': 1740,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
- 'info_dict': {
- 'id': 'b00yng1d',
- 'ext': 'flv',
- 'title': 'The Man in Black: Series 3: The Printed Name',
- 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
- 'duration': 1800,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Episode is no longer available on BBC iPlayer Radio',
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
- 'info_dict': {
- 'id': 'b00yng1d',
- 'ext': 'flv',
- 'title': 'The Voice UK: Series 3: Blind Auditions 5',
- 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
- 'duration': 5100,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
- 'info_dict': {
- 'id': 'b03k3pb7',
- 'ext': 'flv',
- 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
- 'description': '2. Invasion',
- 'duration': 3600,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
- }, {
- 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
- 'info_dict': {
- 'id': 'b04v209v',
- 'ext': 'flv',
- 'title': 'Pete Tong, The Essential New Tune Special',
- 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
- 'duration': 10800,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
- 'note': 'Audio',
- 'info_dict': {
- 'id': 'p02frcch',
- 'ext': 'flv',
- 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
- 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
- 'duration': 3507,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
- 'note': 'Video',
- 'info_dict': {
- 'id': 'p025c103',
- 'ext': 'flv',
- 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
- 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
- 'duration': 226,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
- 'info_dict': {
- 'id': 'p02n76xf',
- 'ext': 'flv',
- 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
- 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
- 'duration': 3540,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'geolocation',
- }, {
- 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
- 'only_matching': True,
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
- 'only_matching': True,
- }, {
- 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
- 'only_matching': True,
- }
- ]
-
- def _extract_asx_playlist(self, connection, programme_id):
- asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
- return [ref.get('href') for ref in asx.findall('./Entry/ref')]
-
- def _extract_connection(self, connection, programme_id):
- formats = []
- protocol = connection.get('protocol')
- supplier = connection.get('supplier')
- if protocol == 'http':
- href = connection.get('href')
- # ASX playlist
- if supplier == 'asx':
- for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
- formats.append({
- 'url': ref,
- 'format_id': 'ref%s_%s' % (i, supplier),
- })
- # Direct link
- else:
- formats.append({
- 'url': href,
- 'format_id': supplier,
- })
- elif protocol == 'rtmp':
- application = connection.get('application', 'ondemand')
- auth_string = connection.get('authString')
- identifier = connection.get('identifier')
- server = connection.get('server')
- formats.append({
- 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
- 'play_path': identifier,
- 'app': '%s?%s' % (application, auth_string),
- 'page_url': 'http://www.bbc.co.uk',
- 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
- 'rtmp_live': False,
- 'ext': 'flv',
- 'format_id': supplier,
- })
- return formats
-
- def _extract_items(self, playlist):
- return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
-
- def _extract_medias(self, media_selection):
- error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
- if error is not None:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
- return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
-
- def _extract_connections(self, media):
- return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
-
- def _extract_video(self, media, programme_id):
- formats = []
- vbr = int(media.get('bitrate'))
- vcodec = media.get('encoding')
- service = media.get('service')
- width = int(media.get('width'))
- height = int(media.get('height'))
- file_size = int(media.get('media_file_size'))
- for connection in self._extract_connections(media):
- conn_formats = self._extract_connection(connection, programme_id)
- for format in conn_formats:
- format.update({
- 'format_id': '%s_%s' % (service, format['format_id']),
- 'width': width,
- 'height': height,
- 'vbr': vbr,
- 'vcodec': vcodec,
- 'filesize': file_size,
- })
- formats.extend(conn_formats)
- return formats
-
- def _extract_audio(self, media, programme_id):
- formats = []
- abr = int(media.get('bitrate'))
- acodec = media.get('encoding')
- service = media.get('service')
- for connection in self._extract_connections(media):
- conn_formats = self._extract_connection(connection, programme_id)
- for format in conn_formats:
- format.update({
- 'format_id': '%s_%s' % (service, format['format_id']),
- 'abr': abr,
- 'acodec': acodec,
- })
- formats.extend(conn_formats)
- return formats
-
- def _get_subtitles(self, media, programme_id):
- subtitles = {}
- for connection in self._extract_connections(media):
- captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
- lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
- ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
- srt = ''
-
- def _extract_text(p):
- if p.text is not None:
- stripped_text = p.text.strip()
- if stripped_text:
- return stripped_text
- return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
- for pos, p in enumerate(ps):
- srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
- subtitles[lang] = [
- {
- 'url': connection.get('href'),
- 'ext': 'ttml',
- },
- {
- 'data': srt,
- 'ext': 'srt',
- },
- ]
- return subtitles
-
- def _download_media_selector(self, programme_id):
- try:
- media_selection = self._download_xml(
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
- programme_id, 'Downloading media selection XML')
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
- media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
- else:
- raise
-
- formats = []
- subtitles = None
-
- for media in self._extract_medias(media_selection):
- kind = media.get('kind')
- if kind == 'audio':
- formats.extend(self._extract_audio(media, programme_id))
- elif kind == 'video':
- formats.extend(self._extract_video(media, programme_id))
- elif kind == 'captions':
- subtitles = self.extract_subtitles(media, programme_id)
-
- return formats, subtitles
-
- def _download_playlist(self, playlist_id):
- try:
- playlist = self._download_json(
- 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
- playlist_id, 'Downloading playlist JSON')
-
- version = playlist.get('defaultAvailableVersion')
- if version:
- smp_config = version['smpConfig']
- title = smp_config['title']
- description = smp_config['summary']
- for item in smp_config['items']:
- kind = item['kind']
- if kind != 'programme' and kind != 'radioProgramme':
- continue
- programme_id = item.get('vpid')
- duration = int(item.get('duration'))
- formats, subtitles = self._download_media_selector(programme_id)
- return programme_id, title, description, duration, formats, subtitles
- except ExtractorError as ee:
- if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
- raise
-
- # fallback to legacy playlist
- playlist = self._download_xml(
- 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
- playlist_id, 'Downloading legacy playlist XML')
-
- no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
- if no_items is not None:
- reason = no_items.get('reason')
- if reason == 'preAvailability':
- msg = 'Episode %s is not yet available' % playlist_id
- elif reason == 'postAvailability':
- msg = 'Episode %s is no longer available' % playlist_id
- elif reason == 'noMedia':
- msg = 'Episode %s is not currently available' % playlist_id
- else:
- msg = 'Episode %s is not available: %s' % (playlist_id, reason)
- raise ExtractorError(msg, expected=True)
-
- for item in self._extract_items(playlist):
- kind = item.get('kind')
- if kind != 'programme' and kind != 'radioProgramme':
- continue
- title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
- description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
- programme_id = item.get('identifier')
- duration = int(item.get('duration'))
- formats, subtitles = self._download_media_selector(programme_id)
-
- return programme_id, title, description, duration, formats, subtitles
-
- def _real_extract(self, url):
- group_id = self._match_id(url)
-
- webpage = self._download_webpage(url, group_id, 'Downloading video page')
-
- programme_id = None
-
- tviplayer = self._search_regex(
- r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
- webpage, 'player', default=None)
-
- if tviplayer:
- player = self._parse_json(tviplayer, group_id).get('player', {})
- duration = int_or_none(player.get('duration'))
- programme_id = player.get('vpid')
-
- if not programme_id:
- programme_id = self._search_regex(
- r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
-
- if programme_id:
- formats, subtitles = self._download_media_selector(programme_id)
- title = self._og_search_title(webpage)
- description = self._search_regex(
- r'<p class="medium-description">([^<]+)</p>',
- webpage, 'description', fatal=False)
- else:
- programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
-
- self._sort_formats(formats)
-
- return {
- 'id': programme_id,
- 'title': title,
- 'description': description,
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
- }
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
index d2abd4d77..03dad4636 100644
--- a/youtube_dl/extractor/bet.py
+++ b/youtube_dl/extractor/bet.py
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
xpath_text,
xpath_with_ns,
@@ -16,11 +16,11 @@ class BetIE(InfoExtractor):
{
'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
'info_dict': {
- 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+ 'id': 'news/national/2014/a-conversation-with-president-obama',
'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
'ext': 'flv',
- 'title': 'BET News Presents: A Conversation With President Obama',
- 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+ 'title': 'A Conversation With President Obama',
+ 'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
'duration': 1534,
'timestamp': 1418075340,
'upload_date': '20141208',
@@ -35,7 +35,7 @@ class BetIE(InfoExtractor):
{
'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
'info_dict': {
- 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+ 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
'display_id': 'justice-for-ferguson-a-community-reacts',
'ext': 'flv',
'title': 'Justice for Ferguson: A Community Reacts',
@@ -57,10 +57,13 @@ class BetIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- media_url = compat_urllib_parse.unquote(self._search_regex(
+ media_url = compat_urllib_parse_unquote(self._search_regex(
[r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
webpage, 'media URL'))
+ video_id = self._search_regex(
+ r'/video/(.*)/_jcr_content/', media_url, 'video id')
+
mrss = self._download_xml(media_url, display_id)
item = mrss.find('./channel/item')
@@ -75,8 +78,6 @@ class BetIE(InfoExtractor):
description = xpath_text(
item, './description', 'description', fatal=False)
- video_id = xpath_text(item, './guid', 'video id', fatal=False)
-
timestamp = parse_iso8601(xpath_text(
item, xpath_with_ns('./dc:date', NS_MAP),
'upload date', fatal=False))
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
index 77b562d99..4d8cce1ef 100644
--- a/youtube_dl/extractor/bild.py
+++ b/youtube_dl/extractor/bild.py
@@ -2,7 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ fix_xml_ampersands,
+)
class BildIE(InfoExtractor):
@@ -15,7 +18,7 @@ class BildIE(InfoExtractor):
'id': '38184146',
'ext': 'mp4',
'title': 'BILD hat sie getestet',
- 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 196,
'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
}
@@ -25,7 +28,7 @@ class BildIE(InfoExtractor):
video_id = self._match_id(url)
xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
- doc = self._download_xml(xml_url, video_id)
+ doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands)
duration = int_or_none(doc.attrib.get('duration'), scale=1000)
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 7ca835e31..ecc17ebeb 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import re
import itertools
+import json
+import xml.etree.ElementTree as ET
from .common import InfoExtractor
from ..utils import (
@@ -39,8 +41,15 @@ class BiliBiliIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- if self._search_regex(r'(此视频不存在或被删除)', webpage, 'error message', default=None):
- raise ExtractorError('The video does not exist or was deleted', expected=True)
+ if '(此视频不存在或被删除)' in webpage:
+ raise ExtractorError(
+ 'The video does not exist or was deleted', expected=True)
+
+ if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
+ raise ExtractorError(
+ 'The video is not available in your region due to copyright reasons',
+ expected=True)
+
video_code = self._search_regex(
r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
@@ -67,11 +76,19 @@ class BiliBiliIE(InfoExtractor):
entries = []
- lq_doc = self._download_xml(
+ lq_page = self._download_webpage(
'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
video_id,
note='Downloading LQ video info'
)
+ try:
+ err_info = json.loads(lq_page)
+ raise ExtractorError(
+ 'BiliBili said: ' + err_info['error_text'], expected=True)
+ except ValueError:
+ pass
+
+ lq_doc = ET.fromstring(lq_page)
lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml(
@@ -80,9 +97,11 @@ class BiliBiliIE(InfoExtractor):
note='Downloading HQ video info',
fatal=False,
)
- hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None)
-
- assert len(lq_durls) == len(hq_durls)
+ if hq_doc is not False:
+ hq_durls = hq_doc.findall('./durl')
+ assert len(lq_durls) == len(hq_durls)
+ else:
+ hq_durls = itertools.repeat(None)
i = 1
for lq_durl, hq_durl in zip(lq_durls, hq_durls):
@@ -93,7 +112,7 @@ class BiliBiliIE(InfoExtractor):
'filesize': int_or_none(
lq_durl.find('./size'), get_attr='text'),
}]
- if hq_durl:
+ if hq_durl is not None:
formats.append({
'format_id': 'hq',
'quality': 2,
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index fb56cd78d..c3296283d 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_urllib_request,
compat_urlparse,
)
@@ -14,6 +13,8 @@ from ..utils import (
int_or_none,
parse_iso8601,
unescapeHTML,
+ xpath_text,
+ xpath_with_ns,
)
@@ -23,10 +24,10 @@ class BlipTVIE(InfoExtractor):
_TESTS = [
{
'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
- 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+ 'md5': '80baf1ec5c3d2019037c1c707d676b9f',
'info_dict': {
'id': '5779306',
- 'ext': 'mov',
+ 'ext': 'm4v',
'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
'timestamp': 1323138843,
@@ -100,6 +101,20 @@ class BlipTVIE(InfoExtractor):
'vcodec': 'none',
}
},
+ {
+ # missing duration
+ 'url': 'http://blip.tv/rss/flash/6700880',
+ 'info_dict': {
+ 'id': '6684191',
+ 'ext': 'm4v',
+ 'title': 'Cowboy Bebop: Gateway Shuffle Review',
+ 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8',
+ 'timestamp': 1386639757,
+ 'upload_date': '20131210',
+ 'uploader': 'sfdebris',
+ 'uploader_id': '706520',
+ }
+ }
]
@staticmethod
@@ -128,35 +143,34 @@ class BlipTVIE(InfoExtractor):
rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
- def blip(s):
- return '{http://blip.tv/dtd/blip/1.0}%s' % s
-
- def media(s):
- return '{http://search.yahoo.com/mrss/}%s' % s
-
- def itunes(s):
- return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+ def _x(p):
+ return xpath_with_ns(p, {
+ 'blip': 'http://blip.tv/dtd/blip/1.0',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+ })
item = rss.find('channel/item')
- video_id = item.find(blip('item_id')).text
- title = item.find('./title').text
- description = clean_html(compat_str(item.find(blip('puredescription')).text))
- timestamp = parse_iso8601(item.find(blip('datestamp')).text)
- uploader = item.find(blip('user')).text
- uploader_id = item.find(blip('userid')).text
- duration = int(item.find(blip('runtime')).text)
- media_thumbnail = item.find(media('thumbnail'))
- thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
- categories = [category.text for category in item.findall('category')]
+ video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id
+ title = xpath_text(item, 'title', 'title', fatal=True)
+ description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description'))
+ timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp'))
+ uploader = xpath_text(item, _x('blip:user'), 'uploader')
+ uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id')
+ duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration'))
+ media_thumbnail = item.find(_x('media:thumbnail'))
+ thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None
+ else xpath_text(item, 'image', 'thumbnail'))
+ categories = [category.text for category in item.findall('category') if category is not None]
formats = []
subtitles_urls = {}
- media_group = item.find(media('group'))
- for media_content in media_group.findall(media('content')):
+ media_group = item.find(_x('media:group'))
+ for media_content in media_group.findall(_x('media:content')):
url = media_content.get('url')
- role = media_content.get(blip('role'))
+ role = media_content.get(_x('blip:role'))
msg = self._download_webpage(
url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
video_id, 'Resolving URL for %s' % role)
@@ -175,8 +189,8 @@ class BlipTVIE(InfoExtractor):
'url': real_url,
'format_id': role,
'format_note': media_type,
- 'vcodec': media_content.get(blip('vcodec')) or 'none',
- 'acodec': media_content.get(blip('acodec')),
+ 'vcodec': media_content.get(_x('blip:vcodec')) or 'none',
+ 'acodec': media_content.get(_x('blip:acodec')),
'filesize': media_content.get('filesize'),
'width': int_or_none(media_content.get('width')),
'height': int_or_none(media_content.get('height')),
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 45ba51732..66e394e10 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -16,27 +16,38 @@ class BRIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html',
- 'md5': '93556dd2bcb2948d9259f8670c516d59',
+ 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
+ 'md5': '83a0477cf0b8451027eb566d88b51106',
'info_dict': {
- 'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',
+ 'id': '48f656ef-287e-486f-be86-459122db22cc',
'ext': 'mp4',
- 'title': 'Wenn das Traditions-Theater wackelt',
- 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
- 'duration': 34,
- 'uploader': 'BR',
- 'upload_date': '20140802',
+ 'title': 'Die böse Überraschung',
+ 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung',
+ 'duration': 180,
+ 'uploader': 'Reinhard Weber',
+ 'upload_date': '20150422',
}
},
{
- 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
- 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+ 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
+ 'md5': 'a44396d73ab6a68a69a568fae10705bb',
'info_dict': {
- 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+ 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
+ 'ext': 'mp4',
+ 'title': 'Manfred Schreiber ist tot',
+ 'description': 'Abendschau kompakt: Manfred Schreiber ist tot',
+ 'duration': 26,
+ }
+ },
+ {
+ 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html',
+ 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
+ 'info_dict': {
+ 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
'ext': 'aac',
- 'title': '"Keine neuen Schulden im nächsten Jahr"',
- 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
- 'duration': 64,
+ 'title': 'Kurzweilig und sehr bewegend',
+ 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend',
+ 'duration': 296,
}
},
{
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 809287d14..aa08051b1 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -18,6 +18,7 @@ class BreakIE(InfoExtractor):
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
+ 'age_limit': 13,
}
}, {
'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 4f60d5366..4721c2293 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -13,6 +13,7 @@ from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
)
from ..utils import (
determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
try:
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
- except xml.etree.ElementTree.ParseError:
+ except compat_xml_parse_error:
return
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
@@ -156,6 +157,28 @@ class BrightcoveIE(InfoExtractor):
linkBase = find_param('linkBaseURL')
if linkBase is not None:
params['linkBaseURL'] = linkBase
+ return cls._make_brightcove_url(params)
+
+ @classmethod
+ def _build_brighcove_url_from_js(cls, object_js):
+ # The layout of JS is as follows:
+ # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+ # // build Brightcove <object /> XML
+ # }
+ m = re.search(
+ r'''(?x)customBC.\createVideo\(
+ .*? # skipping width and height
+ ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
+ ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
+ # in length, however it's appended to itself
+ # in places, so truncate
+ ["\'](?P<videoID>\d+)["\'] # @videoPlayer
+ ''', object_js)
+ if m:
+ return cls._make_brightcove_url(m.groupdict())
+
+ @classmethod
+ def _make_brightcove_url(cls, params):
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
@@ -172,7 +195,7 @@ class BrightcoveIE(InfoExtractor):
"""Return a list of all Brightcove URLs from the webpage """
url_m = re.search(
- r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+ r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
webpage)
if url_m:
url = unescapeHTML(url_m.group(1))
@@ -188,7 +211,12 @@ class BrightcoveIE(InfoExtractor):
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
).+?>\s*</object>''',
webpage)
- return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+ if matches:
+ return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
index 6252be05b..3b2de517e 100644
--- a/youtube_dl/extractor/byutv.py
+++ b/youtube_dl/extractor/byutv.py
@@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor):
'ext': 'mp4',
'description': 'md5:5438d33774b6bdc662f9485a340401cc',
'title': 'Season 5 Episode 5',
- 'thumbnail': 're:^https?://.*promo.*'
+ 'thumbnail': 're:^https?://.*\.jpg$'
},
'params': {
'skip_download': True,
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 1b14471e5..57e0cda2c 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor):
}
_TESTS = [{
- 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
- 'md5': '3db39fb48b9685438ecf33a1078023e4',
+ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
+ 'md5': 'b3481d7ca972f61e37420798d0a9d934',
'info_dict': {
- 'id': '922470',
+ 'id': '1263092',
'ext': 'flv',
- 'title': 'Zapping - 26/08/13',
- 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
- 'upload_date': '20130826',
+ 'title': 'Le Zapping - 13/05/15',
+ 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
+ 'upload_date': '20150513',
},
}, {
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor):
'skip': 'videos get deleted after a while',
}, {
'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
- 'md5': '65aa83ad62fe107ce29e564bb8712580',
+ 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4',
'info_dict': {
'id': '1213714',
'ext': 'flv',
@@ -106,15 +106,11 @@ class CanalplusIE(InfoExtractor):
continue
format_id = fmt.tag
if format_id == 'HLS':
- hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv')
- for fmt in hls_formats:
- fmt['preference'] = preference(format_id)
- formats.extend(hls_formats)
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', preference=preference(format_id)))
elif format_id == 'HDS':
- hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id)
- for fmt in hds_formats:
- fmt['preference'] = preference(format_id)
- formats.extend(hds_formats)
+ formats.extend(self._extract_f4m_formats(
+ format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id)))
else:
formats.append({
'url': format_url,
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 1ceb9d8d9..75fffb156 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -4,12 +4,13 @@ from .common import InfoExtractor
class CBSIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+ _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '4JUVEwq3wUT7',
+ 'display_id': 'connect-chat-feat-garth-brooks',
'ext': 'flv',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -24,6 +25,7 @@ class CBSIE(InfoExtractor):
'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
'info_dict': {
'id': 'WWF_5KqY3PK1',
+ 'display_id': 'st-vincent',
'ext': 'flv',
'title': 'Live on Letterman - St. Vincent',
'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -34,12 +36,23 @@ class CBSIE(InfoExtractor):
'skip_download': True,
},
'_skip': 'Blocked outside the US',
+ }, {
+ 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
real_id = self._search_regex(
- r"video\.settings\.pid\s*=\s*'([^']+)';",
+ [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
webpage, 'real video ID')
- return self.url_result('theplatform:%s' % real_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': 'theplatform:%s' % real_id,
+ 'display_id': display_id,
+ }
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 7e47960ab..52e61d85b 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):
'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
'ext': 'flv',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
- 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
},
'params': {
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index 2a5d4be18..6924eac70 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
_TEST = {
'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
- 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+ 'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
'id': '20131228183',
'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
matches = re.finditer(r'''(?xs)
<(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
- <a\s+href='(?P<http_url>[^']+)'>\s*
+ <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
(?:
.*?
<a\s+href='(?P<torrent_url>[^']+\.torrent)'
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 65f6be623..dda583680 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_request,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..utils import (
@@ -88,7 +89,7 @@ class CeskaTelevizeIE(InfoExtractor):
if playlist_url == 'error_region':
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
+ req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
playlist = self._download_json(req, video_id)
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index c922f6959..0206d96db 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
- decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+ decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index cf0a7551b..fd1770dac 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import ExtractorError
from .bliptv import BlipTVIE
+from .screenwavemedia import ScreenwaveMediaIE
class CinemassacreIE(InfoExtractor):
@@ -60,6 +61,17 @@ class CinemassacreIE(InfoExtractor):
'uploader_id': 'Cinemassacre',
'title': 'AVGN: McKids',
}
+ },
+ {
+ 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+ 'md5': '1376908e49572389e7b06251a53cdd08',
+ 'info_dict': {
+ 'id': 'Cinemassacre-555779690c440',
+ 'ext': 'mp4',
+ 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+ 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+ 'upload_date': '20150525',
+ }
}
]
@@ -72,10 +84,10 @@ class CinemassacreIE(InfoExtractor):
playerdata_url = self._search_regex(
[
- r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
- r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
+ ScreenwaveMediaIE.EMBED_PATTERN,
+ r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
],
- webpage, 'player data URL', default=None)
+ webpage, 'player data URL', default=None, group='url')
if not playerdata_url:
playerdata_url = BlipTVIE._extract_url(webpage)
if not playerdata_url:
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index a5c3cb7c6..7af903571 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,53 +1,68 @@
from __future__ import unicode_literals
import re
-import time
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
- parse_duration,
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+ remove_end,
)
class ClipfishIE(InfoExtractor):
- IE_NAME = 'clipfish'
-
- _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- 'md5': '2521cd644e862936cf2e698206e47385',
+ 'md5': '79bc922f3e8a9097b3d68a93780fd475',
'info_dict': {
'id': '3966754',
'ext': 'mp4',
'title': 'FIFA 14 - E3 2013 Trailer',
+ 'timestamp': 1370938118,
+ 'upload_date': '20130611',
'duration': 82,
- },
- 'skip': 'Blocked in the US'
+ }
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
- info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
- (video_id, int(time.time())))
- doc = self._download_xml(
- info_url, video_id, note='Downloading info page')
- title = doc.find('title').text
- video_url = doc.find('filename').text
- if video_url is None:
- xml_bytes = xml.etree.ElementTree.tostring(doc)
- raise ExtractorError('Cannot find video URL in document %r' %
- xml_bytes)
- thumbnail = doc.find('imageurl').text
- duration = parse_duration(doc.find('duration').text)
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_info = self._parse_json(
+ js_to_json(self._html_search_regex(
+ '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')),
+ video_id)
+
+ formats = []
+ for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage):
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.append({
+ 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
+ 'ext': 'mp4',
+ 'format_id': 'hls',
+ })
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': ext,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' - Video')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(video_info.get('length'))
+ timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))
return {
'id': video_id,
'title': title,
- 'url': video_url,
+ 'formats': formats,
'thumbnail': thumbnail,
'duration': duration,
+ 'timestamp': timestamp,
}
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index d07d544ea..8306d6fb7 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
@@ -10,9 +8,9 @@ from ..utils import (
class ClipsyndicateIE(InfoExtractor):
- _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+ _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
'md5': '4d7d549451bad625e0ff3d7bd56d776c',
'info_dict': {
@@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor):
'duration': 612,
'thumbnail': 're:^https?://.+\.jpg',
},
- }
+ }, {
+ 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
video_id, 'Downlaoding player')
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 3145b3051..5dd69bff7 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -11,7 +11,7 @@ from ..utils import (
class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
'params': {
'skip_download': 'requires rtmpdump',
}
- }
+ }, {
+ 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+ 'info_dict': {
+ 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+ 'ext': 'flv',
+ 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+ 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+ 'uploader': 'Ashley Esqueda',
+ 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
raise ExtractorError('Cannot find video data')
mpx_account = data['config']['players']['default']['mpx_account']
- vid = vdata['files']['rtmp']
+ vid = vdata['files'].get('rtmp', vdata['files']['hds'])
tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
video_id = vdata['id']
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 5efc5f4fe..3b1bd4033 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -12,7 +12,7 @@ from ..utils import (
class CNNIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
- (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
index 9c25b2223..81f3d7697 100644
--- a/youtube_dl/extractor/comcarcoff.py
+++ b/youtube_dl/extractor/comcarcoff.py
@@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor):
webpage, 'full data json'))
video_id = full_data['activeVideo']['video']
- video_data = full_data['videos'][video_id]
+ video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id]
thumbnails = [{
'url': video_data['images']['thumb'],
}, {
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 3ae5d5212..39cef9c5b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,26 +14,35 @@ import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
+ compat_cookies,
+ compat_getpass,
compat_HTTPError,
compat_http_client,
compat_urllib_error,
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
+ compat_urllib_request,
compat_urlparse,
compat_str,
)
from ..utils import (
+ NO_DEFAULT,
age_restricted,
bug_reports_message,
clean_html,
compiled_regex_type,
+ determine_ext,
ExtractorError,
+ fix_xml_ampersands,
float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
unescapeHTML,
+ url_basename,
+ xpath_text,
+ xpath_with_ns,
)
-_NO_DEFAULT = object()
class InfoExtractor(object):
@@ -63,7 +72,7 @@ class InfoExtractor(object):
Potential fields:
* url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
+ * ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
Calculated from the format_id, width, height.
@@ -153,7 +162,7 @@ class InfoExtractor(object):
lower to higher preference, each element is a dictionary
with the "ext" entry and one of:
* "data": The subtitles file contents
- * "url": A url pointing to the subtitles file
+ * "url": A URL pointing to the subtitles file
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer.
@@ -174,13 +183,18 @@ class InfoExtractor(object):
Set to "root" to indicate that this is a
comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
- webpage_url: The url to the video webpage, if given to youtube-dl it
+ webpage_url: The URL to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ start_time: Time in seconds where the reproduction should start, as
+ specified in the URL.
+ end_time: Time in seconds where the reproduction should end, as
+ specified in the URL.
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -191,8 +205,8 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "title" and "id" attributes with the same
- semantics as videos (see above).
+ Additionally, playlists can have "title", "description" and "id" attributes
+ with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that
@@ -496,10 +510,16 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
+ @staticmethod
+ def raise_login_required(msg='This video is only available for registered users'):
+ raise ExtractorError(
+ '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+ expected=True)
+
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
- """Returns a url that points to a page that should be processed"""
+ """Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
@@ -523,7 +543,7 @@ class InfoExtractor(object):
video_info['description'] = playlist_description
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -549,7 +569,7 @@ class InfoExtractor(object):
return next(g for g in mobj.groups() if g is not None)
else:
return mobj.group(group)
- elif default is not _NO_DEFAULT:
+ elif default is not NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
@@ -557,7 +577,7 @@ class InfoExtractor(object):
self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
@@ -597,7 +617,7 @@ class InfoExtractor(object):
return (username, password)
- def _get_tfa_info(self):
+ def _get_tfa_info(self, note='two-factor verification code'):
"""
Get the two-factor authentication info
TODO - asking the user will be required for sms/phone verify
@@ -611,7 +631,7 @@ class InfoExtractor(object):
if downloader_params.get('twofactor', None) is not None:
return downloader_params['twofactor']
- return None
+ return compat_getpass('Type %s and press [Return]: ' % note)
# Helper functions for extracting OpenGraph info
@staticmethod
@@ -624,6 +644,12 @@ class InfoExtractor(object):
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -633,7 +659,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -654,9 +680,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -705,6 +729,27 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
+ @staticmethod
+ def _hidden_inputs(html):
+ hidden_inputs = {}
+ for input in re.findall(r'<input([^>]+)>', html):
+ if not re.search(r'type=(["\'])hidden\1', input):
+ continue
+ name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+ if not name:
+ continue
+ value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+ if not value:
+ continue
+ hidden_inputs[name.group('value')] = value.group('value')
+ return hidden_inputs
+
+ def _form_hidden_inputs(self, form_id, html):
+ form = self._search_regex(
+ r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+ html, '%s form' % form_id, group='form')
+ return self._hidden_inputs(form)
+
def _sort_formats(self, formats, field_preference=None):
if not formats:
raise ExtractorError('No video formats found')
@@ -764,7 +809,7 @@ class InfoExtractor(object):
f.get('fps') if f.get('fps') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id'),
+ f.get('format_id') if f.get('format_id') is not None else '',
)
formats.sort(key=_formats_key)
@@ -786,8 +831,8 @@ class InfoExtractor(object):
return True
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
- self.report_warning(
- '%s URL is invalid, skipping' % item, video_id)
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
return False
raise
@@ -815,10 +860,14 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip()):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
- 'Unable to download f4m manifest')
+ 'Unable to download f4m manifest',
+ # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+ # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ transform_source=transform_source)
formats = []
manifest_version = '1.0'
@@ -828,8 +877,19 @@ class InfoExtractor(object):
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
- manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
- (media_el.attrib.get('href') or media_el.attrib.get('url')))
+ media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ if not media_url:
+ continue
+ manifest_url = (
+ media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+ # If media_url is itself a f4m manifest do the recursive extraction
+ # since bitrates in parent manifest (this one) and media_url manifest
+ # may differ leading to inability to resolve the format by requested
+ # bitrate in f4m downloader
+ if determine_ext(manifest_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
@@ -846,7 +906,8 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
- m3u8_id=None):
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True):
formats = [{
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -865,8 +926,11 @@ class InfoExtractor(object):
m3u8_doc = self._download_webpage(
m3u8_url, video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information',
+ fatal=fatal)
+ if m3u8_doc is False:
+ return m3u8_doc
last_info = None
last_media = None
kv_rex = re.compile(
@@ -896,7 +960,7 @@ class InfoExtractor(object):
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- last_media_name = last_media.get('NAME') if last_media else None
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
f = {
'format_id': '-'.join(format_id),
@@ -927,69 +991,221 @@ class InfoExtractor(object):
self._sort_formats(formats)
return formats
- # TODO: improve extraction
- def _extract_smil_formats(self, smil_url, video_id, fatal=True):
- smil = self._download_xml(
- smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ @staticmethod
+ def _xpath_ns(path, namespace=None):
+ if not namespace:
+ return path
+ out = []
+ for c in path.split('/'):
+ if not c or c == '.':
+ out.append(c)
+ else:
+ out.append('{%s}%s' % (namespace, c))
+ return '/'.join(out)
+
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
if smil is False:
assert not fatal
return []
- base = smil.find('./head/meta').get('base')
+ namespace = self._parse_smil_namespace(smil)
+
+ return self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ if smil is False:
+ return {}
+ return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+ def _download_smil(self, smil_url, video_id, fatal=True):
+ return self._download_xml(
+ smil_url, video_id, 'Downloading SMIL file',
+ 'Unable to download SMIL file', fatal=fatal)
+
+ def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+ namespace = self._parse_smil_namespace(smil)
+
+ formats = self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+ subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+ video_id = os.path.splitext(url_basename(smil_url))[0]
+ title = None
+ description = None
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ name = meta.attrib.get('name')
+ content = meta.attrib.get('content')
+ if not name or not content:
+ continue
+ if not title and name == 'title':
+ title = content
+ elif not description and name in ('description', 'abstract'):
+ description = content
+
+ return {
+ 'id': video_id,
+ 'title': title or video_id,
+ 'description': description,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _parse_smil_namespace(self, smil):
+ return self._search_regex(
+ r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ base = smil_url
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ b = meta.get('base') or meta.get('httpBase')
+ if b:
+ base = b
+ break
formats = []
rtmp_count = 0
- if smil.findall('./body/seq/video'):
- video = smil.findall('./body/seq/video')[0]
- fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
- formats.extend(fmts)
- else:
- for video in smil.findall('./body/switch/video'):
- fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
- formats.extend(fmts)
+ http_count = 0
+
+ videos = smil.findall(self._xpath_ns('.//video', namespace))
+ for video in videos:
+ src = video.get('src')
+ if not src:
+ continue
+
+ bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ filesize = int_or_none(video.get('size') or video.get('fileSize'))
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ proto = video.get('proto')
+ ext = video.get('ext')
+ src_ext = determine_ext(src)
+ streamer = video.get('streamer') or base
+
+ if proto == 'rtmp' or streamer.startswith('rtmp'):
+ rtmp_count += 1
+ formats.append({
+ 'url': streamer,
+ 'play_path': src,
+ 'ext': 'flv',
+ 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+ 'tbr': bitrate,
+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+ if transform_rtmp_url:
+ streamer, src = transform_rtmp_url(streamer, src)
+ formats[-1].update({
+ 'url': streamer,
+ 'play_path': src,
+ })
+ continue
+
+ src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+ if proto == 'm3u8' or src_ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+ continue
+
+ if src_ext == 'f4m':
+ f4m_url = src_url
+ if not f4m_params:
+ f4m_params = {
+ 'hdcore': '3.2.0',
+ 'plugin': 'flowplayer-3.2.0.1',
+ }
+ f4m_url += '&' if '?' in f4m_url else '?'
+ f4m_url += compat_urllib_parse.urlencode(f4m_params)
+ formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+ continue
+
+ if src_url.startswith('http'):
+ http_count += 1
+ formats.append({
+ 'url': src_url,
+ 'ext': ext or src_ext or 'flv',
+ 'format_id': 'http-%d' % (bitrate or http_count),
+ 'tbr': bitrate,
+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+ continue
self._sort_formats(formats)
return formats
- def _parse_smil_video(self, video, video_id, base, rtmp_count):
- src = video.get('src')
- if not src:
- return ([], rtmp_count)
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- proto = video.get('proto')
- if not proto:
- if base:
- if base.startswith('rtmp'):
- proto = 'rtmp'
- elif base.startswith('http'):
- proto = 'http'
- ext = video.get('ext')
- if proto == 'm3u8':
- return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
- elif proto == 'rtmp':
- rtmp_count += 1
- streamer = video.get('streamer') or base
- return ([{
- 'url': streamer,
- 'play_path': src,
- 'ext': 'flv',
- 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- }], rtmp_count)
- elif proto.startswith('http'):
- return ([{
- 'url': base + src,
- 'ext': ext or 'flv',
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- }], rtmp_count)
+ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+ subtitles = {}
+ for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+ src = textstream.get('src')
+ if not src:
+ continue
+ ext = textstream.get('ext') or determine_ext(src)
+ if not ext:
+ type_ = textstream.get('type')
+ SUBTITLES_TYPES = {
+ 'text/vtt': 'vtt',
+ 'text/srt': 'srt',
+ 'application/smptett+xml': 'tt',
+ }
+ if type_ in SUBTITLES_TYPES:
+ ext = SUBTITLES_TYPES[type_]
+ lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+ subtitles.setdefault(lang, []).append({
+ 'url': src,
+ 'ext': ext,
+ })
+ return subtitles
+
+ def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+ xspf = self._download_xml(
+ playlist_url, playlist_id, 'Downloading xpsf playlist',
+ 'Unable to download xspf manifest', fatal=fatal)
+ if xspf is False:
+ return []
+ return self._parse_xspf(xspf, playlist_id)
+
+ def _parse_xspf(self, playlist, playlist_id):
+ NS_MAP = {
+ 'xspf': 'http://xspf.org/ns/0/',
+ 's1': 'http://static.streamone.nl/player/ns/0',
+ }
+
+ entries = []
+ for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+ title = xpath_text(
+ track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+ description = xpath_text(
+ track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+ thumbnail = xpath_text(
+ track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+ duration = float_or_none(
+ xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+ formats = [{
+ 'url': location.text,
+ 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+ 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+ 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+ } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': playlist_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ })
+ return entries
def _live_title(self, name):
""" Generate the title for a live video """
@@ -1025,6 +1241,12 @@ class InfoExtractor(object):
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+ def _get_cookies(self, url):
+ """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ req = compat_urllib_request.Request(url)
+ self._downloader.cookiejar.add_cookie_header(req)
+ return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
if t:
@@ -1063,6 +1285,23 @@ class InfoExtractor(object):
def _get_subtitles(self, *args, **kwargs):
raise NotImplementedError("This method must be implemented by subclasses")
+ @staticmethod
+ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+ """ Merge subtitle items for one language. Items with duplicated URLs
+ will be dropped. """
+ list1_urls = set([item['url'] for item in subtitle_list1])
+ ret = list(subtitle_list1)
+ ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+ return ret
+
+ @classmethod
+ def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+ """ Merge two subtitle dictionaries, language by language. """
+ ret = dict(subtitle_dict1)
+ for lang in subtitle_dict2:
+ ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+ return ret
+
def extract_automatic_captions(self, *args, **kwargs):
if (self._downloader.params.get('writeautomaticsub', False) or
self._downloader.params.get('listsubtitles')):
@@ -1072,14 +1311,11 @@ class InfoExtractor(object):
def _get_automatic_captions(self, *args, **kwargs):
raise NotImplementedError("This method must be implemented by subclasses")
- def _subtitles_timecode(self, seconds):
- return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
-
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+ They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
Instances should define _SEARCH_KEY and _MAX_RESULTS.
"""
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47e..c2162aa68 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -12,12 +12,15 @@ from math import pow, sqrt, floor
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
bytes_to_intlist,
intlist_to_bytes,
+ remove_end,
unified_strdate,
urlencode_postdata,
)
@@ -27,7 +30,7 @@ from ..aes import (
class CrunchyrollIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_NETRC_MACHINE = 'crunchyroll'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -46,6 +49,22 @@ class CrunchyrollIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+ 'info_dict': {
+ 'id': '589804',
+ 'ext': 'flv',
+ 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+ 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Danny Choo Network',
+ 'upload_date': '20120213',
+ },
+ 'params': {
+ # rtmp
+ 'skip_download': True,
+ },
+
+ }, {
'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
'only_matching': True,
}]
@@ -76,8 +95,8 @@ class CrunchyrollIE(InfoExtractor):
self._login()
def _decrypt_subtitles(self, data, iv, id):
- data = bytes_to_intlist(data)
- iv = bytes_to_intlist(iv)
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+ iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
id = int(id)
def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +198,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
+ def _extract_subtitles(self, subtitle):
+ sub_root = xml.etree.ElementTree.fromstring(subtitle)
+ return [{
+ 'ext': 'srt',
+ 'data': self._convert_subtitles_to_srt(sub_root),
+ }, {
+ 'ext': 'ass',
+ 'data': self._convert_subtitles_to_ass(sub_root),
+ }]
+
def _get_subtitles(self, video_id, webpage):
subtitles = {}
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +219,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
if not id or not iv or not data:
continue
- id = int(id)
- iv = base64.b64decode(iv)
- data = base64.b64decode(data)
-
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
- sub_root = xml.etree.ElementTree.fromstring(subtitle)
- subtitles[lang_code] = [
- {
- 'ext': 'srt',
- 'data': self._convert_subtitles_to_srt(sub_root),
- },
- {
- 'ext': 'ass',
- 'data': self._convert_subtitles_to_ass(sub_root),
- },
- ]
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
return subtitles
def _real_extract(self, url):
@@ -222,7 +237,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
webpage_url = 'http://www.' + mobj.group('url')
webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
- note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
+ note_m = self._html_search_regex(
+ r'<div class="showmedia-trailer-notice">(.+?)</div>',
+ webpage, 'trailer-notice', default='')
if note_m:
raise ExtractorError(note_m)
@@ -232,6 +249,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
if msg.get('type') == 'error':
raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+ if 'To view this, please log in to verify you are 18 or older.' in webpage:
+ self.raise_login_required()
+
video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@@ -242,7 +262,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
video_upload_date = unified_strdate(video_upload_date)
video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
- playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
+ playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
playerdata_req = compat_urllib_request.Request(playerdata_url)
playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -255,16 +275,31 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
- streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
- # urlencode doesn't work!
- streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+ streamdata_req = compat_urllib_request.Request(
+ 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+ % (stream_id, stream_format, stream_quality),
+ compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
streamdata = self._download_xml(
streamdata_req, video_id,
note='Downloading media info for %s' % video_format)
- video_url = streamdata.find('./host').text
- video_play_path = streamdata.find('./file').text
+ stream_info = streamdata.find('./{default}preload/stream_info')
+ video_url = stream_info.find('./host').text
+ video_play_path = stream_info.find('./file').text
+
+ if '.fplive.net/' in video_url:
+ video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+ parsed_video_url = compat_urlparse.urlparse(video_url)
+ direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+ netloc='v.lvlt.crcdn.net',
+ path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))
+ if self._is_valid_url(direct_video_url, video_id, video_format):
+ formats.append({
+ 'url': direct_video_url,
+ 'format_id': video_format,
+ })
+ continue
+
formats.append({
'url': video_url,
'play_path': video_play_path,
diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py
index 0226f8036..45049bf37 100644
--- a/youtube_dl/extractor/ctsnews.py
+++ b/youtube_dl/extractor/ctsnews.py
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError
class CtsNewsIE(InfoExtractor):
+ IE_DESC = '華視新聞'
# https connection failed (Connection reset)
_VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
_TESTS = [{
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 7615ecd4b..2d90b2224 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -13,8 +13,9 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ determine_ext,
int_or_none,
- orderedSet,
+ parse_iso8601,
str_to_int,
unescapeHTML,
)
@@ -28,10 +29,16 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
request.add_header('Cookie', 'family_filter=off; ff=off')
return request
+ def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
+ request = self._build_request(url)
+ return self._download_webpage_handle(request, *args, **kwargs)
-class DailymotionIE(DailymotionBaseInfoExtractor):
- """Information Extractor for Dailymotion"""
+ def _download_webpage_no_ff(self, url, *args, **kwargs):
+ request = self._build_request(url)
+ return self._download_webpage(request, *args, **kwargs)
+
+class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
@@ -50,8 +57,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'info_dict': {
'id': 'x2iuewm',
'ext': 'mp4',
- 'uploader': 'IGN',
'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+ 'description': 'Several come bundled with the Steam Controller.',
+ 'thumbnail': 're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 74,
+ 'timestamp': 1425657362,
+ 'upload_date': '20150306',
+ 'uploader': 'IGN',
+ 'uploader_id': 'xijv66',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'comment_count': int,
}
},
# Vevo video
@@ -85,38 +101,106 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- url = 'http://www.dailymotion.com/video/%s' % video_id
- # Retrieve video webpage to extract further information
- request = self._build_request(url)
- webpage = self._download_webpage(request, video_id)
+ webpage = self._download_webpage_no_ff(
+ 'https://www.dailymotion.com/video/%s' % video_id, video_id)
+
+ age_limit = self._rta_search(webpage)
- # Extract URL, uploader and title from webpage
- self.report_extraction(video_id)
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
- # It may just embed a vevo video:
- m_vevo = re.search(
+ view_count = str_to_int(self._search_regex(
+ [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"',
+ r'video_views_count[^>]+>\s+([\d\.,]+)'],
+ webpage, 'view count', fatal=False))
+ comment_count = int_or_none(self._search_regex(
+ r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
+ webpage, 'comment count', fatal=False))
+
+ player_v5 = self._search_regex(
+ r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
+ webpage, 'player v5', default=None)
+ if player_v5:
+ player = self._parse_json(player_v5, video_id)
+ metadata = player['metadata']
+ formats = []
+ for quality, media_list in metadata['qualities'].items():
+ for media in media_list:
+ media_url = media.get('url')
+ if not media_url:
+ continue
+ type_ = media.get('type')
+ if type_ == 'application/vnd.lumberjack.manifest':
+ continue
+ if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', m3u8_id='hls'))
+ else:
+ f = {
+ 'url': media_url,
+ 'format_id': quality,
+ }
+ m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ title = metadata['title']
+ duration = int_or_none(metadata.get('duration'))
+ timestamp = int_or_none(metadata.get('created_time'))
+ thumbnail = metadata.get('poster_url')
+ uploader = metadata.get('owner', {}).get('screenname')
+ uploader_id = metadata.get('owner', {}).get('id')
+
+ subtitles = {}
+ for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items():
+ subtitles[subtitle_lang] = [{
+ 'ext': determine_ext(subtitle_url),
+ 'url': subtitle_url,
+ } for subtitle_url in subtitle.get('urls', [])]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'age_limit': age_limit,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ # vevo embed
+ vevo_id = self._search_regex(
r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
- webpage)
- if m_vevo is not None:
- vevo_id = m_vevo.group('id')
- self.to_screen('Vevo video detected: %s' % vevo_id)
- return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+ webpage, 'vevo embed', default=None)
+ if vevo_id:
+ return self.url_result('vevo:%s' % vevo_id, 'Vevo')
- age_limit = self._rta_search(webpage)
+ # fallback old player
+ embed_page = self._download_webpage_no_ff(
+ 'https://www.dailymotion.com/embed/video/%s' % video_id,
+ video_id, 'Downloading embed page')
+
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'video:release_date', webpage, 'upload date'))
+
+ info = self._parse_json(
+ self._search_regex(
+ r'var info = ({.*?}),$', embed_page,
+ 'video info', flags=re.MULTILINE),
+ video_id)
- video_upload_date = None
- mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
- if mobj is not None:
- video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
-
- embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
- embed_request = self._build_request(embed_url)
- embed_page = self._download_webpage(
- embed_request, video_id, 'Downloading embed page')
- info = self._search_regex(r'var info = ({.*?}),$', embed_page,
- 'video info', flags=re.MULTILINE)
- info = json.loads(info)
if info.get('error') is not None:
msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
raise ExtractorError(msg, expected=True)
@@ -137,16 +221,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'width': width,
'height': height,
})
- if not formats:
- raise ExtractorError('Unable to extract video URL')
+ self._sort_formats(formats)
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
- view_count = str_to_int(self._search_regex(
- r'video_views_count[^>]+>\s+([\d\.,]+)',
- webpage, 'view count', fatal=False))
-
title = self._og_search_title(webpage, default=None)
if title is None:
title = self._html_search_regex(
@@ -157,12 +236,14 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'id': video_id,
'formats': formats,
'uploader': info['owner.screenname'],
- 'upload_date': video_upload_date,
+ 'timestamp': timestamp,
'title': title,
+ 'description': description,
'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
'view_count': view_count,
+ 'duration': info['duration']
}
def _get_subtitles(self, video_id, webpage):
@@ -196,18 +277,26 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
}]
def _extract_entries(self, id):
- video_ids = []
+ video_ids = set()
+ processed_urls = set()
for pagenum in itertools.count(1):
- request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
- webpage = self._download_webpage(request,
- id, 'Downloading page %s' % pagenum)
+ page_url = self._PAGE_TEMPLATE % (id, pagenum)
+ webpage, urlh = self._download_webpage_handle_no_ff(
+ page_url, id, 'Downloading page %s' % pagenum)
+ if urlh.geturl() in processed_urls:
+ self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
+ page_url, urlh.geturl()), id)
+ break
+
+ processed_urls.add(urlh.geturl())
- video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
+ for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+ if video_id not in video_ids:
+ yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+ video_ids.add(video_id)
if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
- return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
- for video_id in orderedSet(video_ids)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -224,7 +313,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
@@ -233,12 +322,24 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': 'Rémi Gaillard',
},
'playlist_mincount': 100,
+ }, {
+ 'url': 'http://www.dailymotion.com/user/UnderProject',
+ 'info_dict': {
+ 'id': 'UnderProject',
+ 'title': 'UnderProject',
+ },
+ 'playlist_mincount': 1800,
+ 'expected_warnings': [
+ 'Stopped at duplicated page',
+ ],
+ 'skip': 'Takes too long time',
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
- webpage = self._download_webpage(url, user)
+ webpage = self._download_webpage(
+ 'https://www.dailymotion.com/user/%s' % user, user)
full_user = unescapeHTML(self._html_search_regex(
r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
webpage, 'user'))
@@ -249,3 +350,52 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': full_user,
'entries': self._extract_entries(user),
}
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+ _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
+ _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
+ _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX
+
+ _TESTS = [{
+ # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+ # Tested at FranceTvInfo_2
+ 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+ 'only_matching': True,
+ }, {
+ # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
+ 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_dmcloud_url(self, webpage):
+ mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
+ if mobj:
+ return mobj.group(1)
+
+ mobj = re.search(
+ r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+ webpage)
+ if mobj:
+ return mobj.group(1)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage_no_ff(url, video_id)
+
+ title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+ video_info = self._parse_json(self._search_regex(
+ r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+ # TODO: parse ios_url, which is in fact a manifest
+ video_url = video_info['mp4_url']
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': video_info.get('thumbnail_url'),
+ }
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
new file mode 100644
index 000000000..82261e25c
--- /dev/null
+++ b/youtube_dl/extractor/dcn.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class DCNIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887',
+ 'info_dict':
+ {
+ 'id': '17375',
+ 'ext': 'mp4',
+ 'title': 'رحلة العمر : الحلقة 1',
+ 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 2041,
+ 'timestamp': 1227504126,
+ 'upload_date': '20081124',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ request = compat_urllib_request.Request(
+ 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+ headers={'Origin': 'http://www.dcndigital.ae'})
+
+ video = self._download_json(request, video_id)
+ title = video.get('title_en') or video['title_ar']
+
+ webpage = self._download_webpage(
+ 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?'
+ + compat_urllib_parse.urlencode({
+ 'id': video['id'],
+ 'user_id': video['user_id'],
+ 'signature': video['signature'],
+ 'countries': 'Q0M=',
+ 'filter': 'DENY',
+ }), video_id)
+
+ m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+ rtsp_url = self._search_regex(
+ r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
+ if rtsp_url:
+ formats.append({
+ 'url': rtsp_url,
+ 'format_id': 'rtsp',
+ })
+
+ self._sort_formats(formats)
+
+ img = video.get('img')
+ thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None
+ duration = int_or_none(video.get('duration'))
+ description = video.get('description_en') or video.get('description_ar')
+ timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py
index 8049779b0..263532cc6 100644
--- a/youtube_dl/extractor/dfb.py
+++ b/youtube_dl/extractor/dfb.py
@@ -3,42 +3,47 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import unified_strdate
class DFBIE(InfoExtractor):
IE_NAME = 'tv.dfb.de'
- _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)'
_TEST = {
- 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/',
+ 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
# The md5 is different each time
'info_dict': {
- 'id': '9070',
+ 'id': '11633',
+ 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
'ext': 'flv',
- 'title': 'Highlights des Empfangs in Berlin',
- 'upload_date': '20140716',
+ 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
+ 'upload_date': '20150714',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id)
player_info = self._download_xml(
'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
- video_id)
+ display_id)
video_info = player_info.find('video')
- f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
+ f4m_info = self._download_xml(
+ self._proto_relative_url(video_info.find('url').text.strip()), display_id)
token_el = f4m_info.find('token')
manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
+ formats = self._extract_f4m_formats(manifest_url, display_id)
return {
'id': video_id,
+ 'display_id': display_id,
'title': video_info.find('title').text,
- 'url': manifest_url,
- 'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
- 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]),
+ 'upload_date': unified_strdate(video_info.find('time_date').text),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py
index 3ed1f1663..44e0c5d4d 100644
--- a/youtube_dl/extractor/dhm.py
+++ b/youtube_dl/extractor/dhm.py
@@ -1,10 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- xpath_text,
- parse_duration,
-)
+from ..utils import parse_duration
class DHMIE(InfoExtractor):
@@ -34,24 +31,14 @@ class DHMIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, playlist_id)
playlist_url = self._search_regex(
r"file\s*:\s*'([^']+)'", webpage, 'playlist url')
- playlist = self._download_xml(playlist_url, video_id)
-
- track = playlist.find(
- './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track')
-
- video_url = xpath_text(
- track, './{http://xspf.org/ns/0/}location',
- 'video url', fatal=True)
- thumbnail = xpath_text(
- track, './{http://xspf.org/ns/0/}image',
- 'thumbnail')
+ entries = self._extract_xspf_playlist(playlist_url, playlist_id)
title = self._search_regex(
[r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
@@ -63,11 +50,10 @@ class DHMIE(InfoExtractor):
r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
webpage, 'duration', default=None))
- return {
- 'id': video_id,
- 'url': video_url,
+ entries[0].update({
'title': title,
'description': description,
'duration': duration,
- 'thumbnail': thumbnail,
- }
+ })
+
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index d3e667528..d6723ecf2 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ parse_duration,
parse_iso8601,
- int_or_none,
)
+from ..compat import compat_str
class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
'info_dict': {
- 'id': 'mission-impossible-outtakes',
- 'ext': 'flv',
+ 'id': '20769',
+ 'ext': 'mp4',
'title': 'Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
'timestamp': 1303099200,
'upload_date': '20110418',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
+ }, {
+ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+ 'info_dict': {
+ 'id': 'mythbusters-the-simpsons',
+ 'title': 'MythBusters: The Simpsons',
+ },
+ 'playlist_count': 9,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ info = self._download_json(url + '?flat=1', video_id)
- info = self._parse_json(self._search_regex(
- r'(?s)<script type="application/ld\+json">(.*?)</script>',
- webpage, 'video info'), video_id)
+ video_title = info.get('playlist_title') or info.get('video_title')
- return {
- 'id': video_id,
- 'title': info['name'],
- 'url': info['contentURL'],
- 'description': info.get('description'),
- 'thumbnail': info.get('thumbnailUrl'),
- 'timestamp': parse_iso8601(info.get('uploadDate')),
- 'duration': int_or_none(info.get('duration')),
- }
+ entries = [{
+ 'id': compat_str(video_info['id']),
+ 'formats': self._extract_m3u8_formats(
+ video_info['src'], video_id, ext='mp4',
+ note='Download m3u8 information for video %d' % (idx + 1)),
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'duration': parse_duration(video_info.get('video_length')),
+ 'webpage_url': video_info.get('href'),
+ 'thumbnail': video_info.get('thumbnailURL'),
+ 'alt_title': video_info.get('secondary_title'),
+ 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+ } for idx, video_info in enumerate(info['playlist'])]
+
+ return self.playlist_result(entries, video_id, video_title)
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 479430c51..373b3b4b4 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -9,6 +9,7 @@ from ..compat import (compat_str, compat_basestring)
class DouyuTVIE(InfoExtractor):
+ IE_DESC = '斗鱼'
_VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.douyutv.com/iseven',
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644
index 000000000..38e6597c8
--- /dev/null
+++ b/youtube_dl/extractor/dramafever.py
@@ -0,0 +1,216 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urllib_parse,
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+ _NETRC_MACHINE = 'dramafever'
+
+ _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+
+ _consumer_secret = None
+
+ def _get_consumer_secret(self):
+ mainjs = self._download_webpage(
+ 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+ None, 'Downloading main.js', fatal=False)
+ if not mainjs:
+ return self._CONSUMER_SECRET
+ return self._search_regex(
+ r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+ 'consumer secret', default=self._CONSUMER_SECRET)
+
+ def _real_initialize(self):
+ self._login()
+ self._consumer_secret = self._get_consumer_secret()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'username': username,
+ 'password': password,
+ }
+
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ if all(logout_pattern not in response
+ for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+ error = self._html_search_regex(
+ r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+ IE_NAME = 'dramafever'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+ _TEST = {
+ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512.1',
+ 'ext': 'flv',
+ 'title': 'Cooking with Shin 4512.1',
+ 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1404336058,
+ 'upload_date': '20140702',
+ 'duration': 343,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url).replace('/', '.')
+
+ try:
+ feed = self._download_json(
+ 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+ video_id, 'Downloading episode JSON')['channel']['item']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError):
+ raise ExtractorError(
+ 'Currently unavailable in your country.', expected=True)
+ raise
+
+ media_group = feed.get('media-group', {})
+
+ formats = []
+ for media_content in media_group['media-content']:
+ src = media_content.get('@attributes', {}).get('url')
+ if not src:
+ continue
+ ext = determine_ext(src)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src, video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ title = media_group.get('media-title')
+ description = media_group.get('media-description')
+ duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+ thumbnail = self._proto_relative_url(
+ media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+ timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+ subtitles = {}
+ for media_subtitle in media_group.get('media-subTitle', []):
+ lang = media_subtitle.get('@attributes', {}).get('lang')
+ href = media_subtitle.get('@attributes', {}).get('href')
+ if not lang or not href:
+ continue
+ subtitles[lang] = [{
+ 'ext': 'ttml',
+ 'url': href,
+ }]
+
+ series_id, episode_number = video_id.split('.')
+ episode_info = self._download_json(
+ # We only need a single episode info, so restricting page size to one episode
+ # and dealing with page number as with episode number
+ r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1'
+ % (self._consumer_secret, series_id, episode_number),
+ video_id, 'Downloading episode info JSON', fatal=False)
+ if episode_info:
+ value = episode_info.get('value')
+ if value:
+ subfile = value[0].get('subfile') or value[0].get('new_subfile')
+ if subfile and subfile != 'http://www.dramafever.com/st/':
+ subtitles.setdefault('English', []).append({
+ 'ext': 'srt',
+ 'url': subfile,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+ IE_NAME = 'dramafever:series'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+ _TESTS = [{
+ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512',
+ 'title': 'Cooking with Shin',
+ 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+ },
+ 'playlist_count': 4,
+ }, {
+ 'url': 'http://www.dramafever.com/drama/124/IRIS/',
+ 'info_dict': {
+ 'id': '124',
+ 'title': 'IRIS',
+ 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+ },
+ 'playlist_count': 20,
+ }]
+
+ _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+
+ series = self._download_json(
+ 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+ % (self._consumer_secret, series_id),
+ series_id, 'Downloading series JSON')['series'][series_id]
+
+ title = clean_html(series['name'])
+ description = clean_html(series.get('description') or series.get('description_short'))
+
+ entries = []
+ for page_num in itertools.count(1):
+ episodes = self._download_json(
+ 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+ % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num),
+ series_id, 'Downloading episodes JSON page #%d' % page_num)
+ for episode in episodes.get('value', []):
+ episode_url = episode.get('episode_url')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ compat_urlparse.urljoin(url, episode_url),
+ 'DramaFever', episode.get('guid')))
+ if page_num == episodes['num_pages']:
+ break
+
+ return self.playlist_result(entries, series_id, title, description)
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 7626219ba..8b98b013a 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
- 'md5': 'fe330252ddea607635cf2eb2c99a0af3',
'info_dict': {
'id': '65517',
'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
'upload_date': '20110120',
'duration': 3664,
},
+ 'params': {
+ 'skip_download': True, # requires rtmp
+ },
}, {
'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
'format_id': file['Type'].replace('Video', ''),
'preference': preferencemap.get(file['Type'], -10),
})
+ if format['url'].startswith('rtmp'):
+ rtmp_url = format['url']
+ format['rtmp_live'] = True # --resume does not work
+ if '/bonanza/' in rtmp_url:
+ format['play_path'] = rtmp_url.split('/bonanza/')[1]
formats.append(format)
elif file['Type'] == "Thumb":
thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
description = '%s\n%s\n%s\n' % (
info['Description'], info['Actors'], info['Colophon'])
- for f in formats:
- f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
- f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
self._sort_formats(formats)
display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py
index 37c5c181f..639f9182c 100644
--- a/youtube_dl/extractor/drtuber.py
+++ b/youtube_dl/extractor/drtuber.py
@@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor):
r'<source src="([^"]+)"', webpage, 'video URL')
title = self._html_search_regex(
- [r'class="hd_title" style="[^"]+">([^<]+)</h1>', r'<title>([^<]+) - \d+'],
+ [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
webpage, 'title')
thumbnail = self._html_search_regex(
r'poster="([^"]+)"',
webpage, 'thumbnail', fatal=False)
- like_count = str_to_int(self._html_search_regex(
- r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- dislike_count = str_to_int(self._html_search_regex(
- r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- comment_count = str_to_int(self._html_search_regex(
- r'<span class="comments_count">([\d,\.]+)</span>',
- webpage, 'comment count', fatal=False))
+ def extract_count(id_, name):
+ return str_to_int(self._html_search_regex(
+ r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+ webpage, '%s count' % name, fatal=False))
+
+ like_count = extract_count('rate_likes', 'like')
+ dislike_count = extract_count('rate_dislikes', 'dislike')
+ comment_count = extract_count('comments_count', 'comment')
cats_str = self._search_regex(
- r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False)
+ r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
return {
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index f25ab319e..baa24c6d1 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -1,8 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+)
class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
restricted_to_denmark = asset['RestrictedToDenmark']
spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
for link in asset['Links']:
- target = link['Target']
uri = link['Uri']
+ target = link['Target']
format_id = target
- preference = -1 if target == 'HDS' else -2
+ preference = None
if spoken_subtitles:
- preference -= 2
+ preference = -1
format_id += '-spoken-subtitles'
- formats.append({
- 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
- 'format_id': format_id,
- 'ext': link['FileFormat'],
- 'preference': preference,
- })
+ if target == 'HDS':
+ formats.extend(self._extract_f4m_formats(
+ uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+ video_id, preference, f4m_id=format_id))
+ elif target == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ uri, video_id, 'mp4', preference=preference,
+ m3u8_id=format_id))
+ else:
+ bitrate = link.get('Bitrate')
+ if bitrate:
+ format_id += '-%s' % bitrate
+ formats.append({
+ 'url': uri,
+ 'format_id': format_id,
+ 'tbr': bitrate,
+ 'ext': link.get('FileFormat'),
+ })
subtitles_list = asset.get('SubtitlesList')
if isinstance(subtitles_list, list):
LANGS = {
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
index 9c594b757..999fb5620 100644
--- a/youtube_dl/extractor/dumpert.py
+++ b/youtube_dl/extractor/dumpert.py
@@ -26,7 +26,7 @@ class DumpertIE(InfoExtractor):
video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
- req.add_header('Cookie', 'nsfw=1')
+ req.add_header('Cookie', 'nsfw=1; cpc=10')
webpage = self._download_webpage(req, video_id)
files_base64 = self._search_regex(
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index 9cb1bf301..b1cd4f5d4 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-from ..compat import (
- compat_urllib_parse,
-)
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
class EHowIE(InfoExtractor):
@@ -26,7 +24,7 @@ class EHowIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
- final_url = compat_urllib_parse.unquote(video_url)
+ final_url = compat_urllib_parse_unquote(video_url)
uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
deleted file mode 100644
index 70f8efe27..000000000
--- a/youtube_dl/extractor/empflix.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import unicode_literals
-
-from .tnaflix import TNAFlixIE
-
-
-class EMPFlixIE(TNAFlixIE):
- _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
-
- _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
- _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
- _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-
- _TEST = {
- 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
- 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
- 'info_dict': {
- 'id': '33051',
- 'display_id': 'Amateur-Finger-Fuck',
- 'ext': 'mp4',
- 'title': 'Amateur Finger Fuck',
- 'description': 'Amateur solo finger fucking.',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'age_limit': 18,
- }
- }
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
index 0cbca90b0..7fcd0151d 100644
--- a/youtube_dl/extractor/eroprofile.py
+++ b/youtube_dl/extractor/eroprofile.py
@@ -4,7 +4,10 @@ import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ unescapeHTML
+)
class EroProfileIE(InfoExtractor):
@@ -68,15 +71,14 @@ class EroProfileIE(InfoExtractor):
m = re.search(r'You must be logged in to view this video\.', webpage)
if m:
- raise ExtractorError(
- 'This video requires login. Please specify a username and password and try again.', expected=True)
+ self.raise_login_required('This video requires login')
video_id = self._search_regex(
[r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
webpage, 'video id', default=None)
- video_url = self._search_regex(
- r'<source src="([^"]+)', webpage, 'video url')
+ video_url = unescapeHTML(self._search_regex(
+ r'<source src="([^"]+)', webpage, 'video url'))
title = self._html_search_regex(
r'Title:</th><td>([^<]+)</td>', webpage, 'title')
thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 8facf1185..c85b4c458 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -8,7 +8,8 @@ from ..compat import compat_urllib_request
from ..utils import (
determine_ext,
clean_html,
- qualities,
+ int_or_none,
+ float_or_none,
)
@@ -36,10 +37,10 @@ def _decrypt_config(key, string):
class EscapistIE(InfoExtractor):
- _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
+ _VALID_URL = r'https?://?(?:www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
_TESTS = [{
'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
- 'md5': 'c6793dbda81388f4264c1ba18684a74d',
+ 'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
'info_dict': {
'id': '6618',
'ext': 'mp4',
@@ -47,10 +48,11 @@ class EscapistIE(InfoExtractor):
'title': "Breaking Down Baldur's Gate",
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 264,
+ 'uploader': 'The Escapist',
}
}, {
'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer',
- 'md5': 'cf8842a8a46444d241f9a9980d7874f2',
+ 'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf',
'info_dict': {
'id': '10044',
'ext': 'mp4',
@@ -58,6 +60,7 @@ class EscapistIE(InfoExtractor):
'title': 'Evolve - One vs Multiplayer',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 304,
+ 'uploader': 'The Escapist',
}
}]
@@ -65,35 +68,33 @@ class EscapistIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- imsVideo = self._parse_json(
+ ims_video = self._parse_json(
self._search_regex(
r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'),
video_id)
- video_id = imsVideo['videoID']
- key = imsVideo['hash']
+ video_id = ims_video['videoID']
+ key = ims_video['hash']
- quality = qualities(['lq', 'hq', 'hd'])
+ config_req = compat_urllib_request.Request(
+ 'http://www.escapistmagazine.com/videos/'
+ 'vidconfig.php?videoID=%s&hash=%s' % (video_id, key))
+ config_req.add_header('Referer', url)
+ config = self._download_webpage(config_req, video_id, 'Downloading video config')
- formats = []
- for q in ['lq', 'hq', 'hd']:
- config_req = compat_urllib_request.Request(
- 'http://www.escapistmagazine.com/videos/'
- 'vidconfig.php?videoID=%s&hash=%s&quality=%s' % (video_id, key, 'mp4_' + q))
- config_req.add_header('Referer', url)
- config = self._download_webpage(config_req, video_id, 'Downloading video config ' + q.upper())
+ data = json.loads(_decrypt_config(key, config))
- data = json.loads(_decrypt_config(key, config))
+ video_data = data['videoData']
- title = clean_html(data['videoData']['title'])
- duration = data['videoData']['duration'] / 1000
+ title = clean_html(video_data['title'])
+ duration = float_or_none(video_data.get('duration'), 1000)
+ uploader = video_data.get('publisher')
- for i, v in enumerate(data['files']['videos']):
-
- formats.append({
- 'url': v,
- 'format_id': determine_ext(v) + '_' + q + str(i),
- 'quality': quality(q),
- })
+ formats = [{
+ 'url': video['src'],
+ 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']),
+ 'height': int_or_none(video.get('res')),
+ } for video in data['files']['videos']]
+ self._sort_formats(formats)
return {
'id': video_id,
@@ -102,4 +103,5 @@ class EscapistIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'duration': duration,
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644
index 000000000..e6f8f0337
--- /dev/null
+++ b/youtube_dl/extractor/espn.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ESPNIE(InfoExtractor):
+ _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+ _WORKING = False
+ _TESTS = [{
+ 'url': 'http://espn.go.com/video/clip?id=10365079',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'class="video-play-button"[^>]+data-id="(\d+)',
+ webpage, 'video id')
+
+ player = self._download_webpage(
+ 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+
+ pcode = self._search_regex(
+ r'["\']pcode=([^"\']+)["\']', player, 'pcode')
+
+ return self.url_result(
+ 'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
+ 'OoyalaExternal')
diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py
new file mode 100644
index 000000000..bf5d2019f
--- /dev/null
+++ b/youtube_dl/extractor/esri.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ parse_filesize,
+ unified_strdate,
+)
+
+
+class EsriVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
+ 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
+ 'info_dict': {
+ 'id': '1124',
+ 'ext': 'mp4',
+ 'title': 'ArcGIS Online - Developing Applications',
+ 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 185,
+ 'upload_date': '20120419',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+ for width, height, content in re.findall(
+ r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
+ for video_url, ext, filesize in re.findall(
+ r'<a[^>]+href="([^"]+)">([^<]+)&nbsp;\(([^<]+)\)</a>', content):
+ formats.append({
+ 'url': compat_urlparse.urljoin(url, video_url),
+ 'ext': ext.lower(),
+ 'format_id': '%s-%s' % (ext.lower(), height),
+ 'width': int(width),
+ 'height': int(height),
+ 'filesize_approx': parse_filesize(filesize),
+ })
+ self._sort_formats(formats)
+
+ title = self._html_search_meta('title', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description', fatal=False)
+
+ thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
+ if thumbnail:
+ thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
+
+ duration = int_or_none(self._search_regex(
+ [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
+ webpage, 'duration', fatal=False))
+
+ upload_date = unified_strdate(self._html_search_meta(
+ 'last-modified', webpage, 'upload date', fatal=None))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 937b28fcc..178a7ca4c 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -9,7 +9,7 @@ from ..compat import (
compat_http_client,
compat_str,
compat_urllib_error,
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_request,
)
from ..utils import (
@@ -17,6 +17,8 @@ from ..utils import (
int_or_none,
limit_length,
urlencode_postdata,
+ get_element_by_id,
+ clean_html,
)
@@ -42,6 +44,7 @@ class FacebookIE(InfoExtractor):
'id': '637842556329505',
'ext': 'mp4',
'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
+ 'uploader': 'Tennis on Facebook',
}
}, {
'note': 'Video without discernible title',
@@ -50,7 +53,11 @@ class FacebookIE(InfoExtractor):
'id': '274175099429670',
'ext': 'mp4',
'title': 'Facebook video #274175099429670',
- }
+ 'uploader': 'Asif Nawab Butt',
+ },
+ 'expected_warnings': [
+ 'title'
+ ]
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
@@ -133,7 +140,7 @@ class FacebookIE(InfoExtractor):
else:
raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
- params_raw = compat_urllib_parse.unquote(data['params'])
+ params_raw = compat_urllib_parse_unquote(data['params'])
params = json.loads(params_raw)
video_data = params['video_data'][0]
@@ -149,15 +156,16 @@ class FacebookIE(InfoExtractor):
raise ExtractorError('Cannot find video formats')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
- fatal=False)
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+ default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
+ webpage, 'alternative title', fatal=False)
video_title = limit_length(video_title, 80)
if not video_title:
video_title = 'Facebook video #%s' % video_id
+ uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
return {
'id': video_id,
@@ -165,4 +173,5 @@ class FacebookIE(InfoExtractor):
'formats': formats,
'duration': int_or_none(video_data.get('video_duration')),
'thumbnail': video_data.get('thumbnail_src'),
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 3c39ca451..cebdd0193 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor
class FazIE(InfoExtractor):
IE_NAME = 'faz.net'
- _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
'info_dict': {
'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
'description': 'md5:1453fbf9a0d041d985a47306192ea253',
},
- }
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 1ccc1a964..e4f7195a8 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -86,7 +86,7 @@ class FC2IE(InfoExtractor):
info_url = (
"http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
- format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.', '%2E')))
+ format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
info_webpage = self._download_webpage(
info_url, video_id, note='Downloading info page')
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644
index 3191116d9..000000000
--- a/youtube_dl/extractor/firedrive.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
- '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
- _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
- _TESTS = [{
- 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
- 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
- 'info_dict': {
- 'id': 'FEB892FA160EBD01',
- 'ext': 'flv',
- 'title': 'bbb_theora_486kbit.flv',
- 'thumbnail': 're:^http://.*\.jpg$',
- },
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://firedrive.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', webpage))
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.firedrive.com')
-
- webpage = self._download_webpage(req, video_id,
- 'Downloading video page')
-
- title = self._search_regex(r'class="external_title_left">(.+)</div>',
- webpage, 'title')
- thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
- 'thumbnail', fatal=False)
- if thumbnail is not None:
- thumbnail = 'http:' + thumbnail
-
- ext = self._search_regex(r'type:\s?\'([^\']+)\',',
- webpage, 'extension', fatal=False)
- video_url = self._search_regex(
- r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': ext,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644
index 000000000..13fbc4da2
--- /dev/null
+++ b/youtube_dl/extractor/fivetv.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ http://
+ (?:www\.)?5-tv\.ru/
+ (?:
+ (?:[^/]+/)+(?P<id>\d+)|
+ (?P<path>[^/?#]+)(?:[/?#])?
+ )
+ '''
+
+ _TESTS = [{
+ 'url': 'http://5-tv.ru/news/96814/',
+ 'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+ 'info_dict': {
+ 'id': '96814',
+ 'ext': 'mp4',
+ 'title': 'Россияне выбрали имя для общенациональной платежной системы',
+ 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://5-tv.ru/video/1021729/',
+ 'info_dict': {
+ 'id': '1021729',
+ 'ext': 'mp4',
+ 'title': '3D принтер',
+ 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+ 'info_dict': {
+ 'id': 'glavnoe',
+ 'ext': 'mp4',
+ 'title': 'Итоги недели с 8 по 14 июня 2015 года',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/films/1507502/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/programs/broadcast/508713/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/angel/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('path')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+ webpage, 'video url')
+
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title')
+ duration = int_or_none(self._og_search_property(
+ 'video:duration', webpage, 'duration', default=None))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py
index 0fb29de75..75399fa7d 100644
--- a/youtube_dl/extractor/folketinget.py
+++ b/youtube_dl/extractor/folketinget.py
@@ -30,6 +30,10 @@ class FolketingetIE(InfoExtractor):
'upload_date': '20141120',
'duration': 3960,
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index b2284ab01..3bb4f6239 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -32,6 +32,7 @@ class FourTubeIE(InfoExtractor):
'view_count': int,
'like_count': int,
'categories': list,
+ 'age_limit': 18,
}
}
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 917f76b1e..3a4a59135 100644
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@@ -1,5 +1,7 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
@@ -8,7 +10,8 @@ from ..utils import (
class FoxNewsIE(InfoExtractor):
- _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+ IE_DESC = 'Fox News and Fox Business Video'
+ _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
_TESTS = [
{
'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
@@ -42,13 +45,19 @@ class FoxNewsIE(InfoExtractor):
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
'only_matching': True,
},
+ {
+ 'url': 'http://video.foxbusiness.com/v/4442309889001',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
video = self._download_json(
- 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id)
+ 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id)
item = video['channel']['item']
title = item['title']
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index 363866b64..df7665176 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -5,7 +5,7 @@ from ..utils import smuggle_url
class FoxSportsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?foxsports\.com/video\?vid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TEST = {
'url': 'http://www.foxsports.com/video?vid=432609859715',
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index edf555b29..75723c00d 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -6,18 +6,15 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
- float_or_none,
parse_duration,
determine_ext,
)
+from .dailymotion import DailymotionCloudIE
class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -58,12 +55,12 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
# See https://github.com/rg3/youtube-dl/issues/3963
# m3u8 urls work fine
continue
- video_url_parsed = compat_urllib_parse_urlparse(video_url)
f4m_url = self._download_webpage(
- 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+ 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url,
video_id, 'Downloading f4m manifest token', fatal=False)
if f4m_url:
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))
+ formats.extend(self._extract_f4m_formats(
+ f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id))
elif video_url.startswith('rtmp'):
@@ -86,7 +83,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
'title': info['titre'],
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
- 'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']),
+ 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'formats': formats,
}
@@ -131,12 +128,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'skip_download': 'HLS (reqires ffmpeg)'
},
'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+ }, {
+ 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+ 'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+ 'info_dict': {
+ 'id': '556e03339473995ee145930c',
+ 'ext': 'mp4',
+ 'title': 'Les entreprises familiales : le secret de la réussite',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
+
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
video_id, catalogue = self._search_regex(
r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
return self._extract_video(video_id, catalogue)
@@ -145,11 +156,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
class FranceTVIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetv'
IE_DESC = 'France 2, 3, 4, 5 and Ô'
- _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
- (?:
- emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
- | (emissions?|jt)/(?P<key>[^/?]+)
- )'''
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?france[2345o]\.fr/
+ (?:
+ emissions/[^/]+/(?:videos|diffusions)|
+ emission/[^/]+|
+ videos|
+ jt
+ )
+ /|
+ embed\.francetv\.fr/\?ue=
+ )
+ (?P<id>[^/?]+)
+ '''
_TESTS = [
# france2
@@ -206,24 +227,46 @@ class FranceTVIE(FranceTVBaseInfoExtractor):
},
# franceo
{
- 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
- 'md5': '52f0bfe202848b15915a2f39aaa8981b',
+ 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
+ 'md5': '47d5816d3b24351cdce512ad7ab31da8',
'info_dict': {
- 'id': '108634970',
+ 'id': '125377621',
'ext': 'flv',
- 'title': 'Infô Afrique',
- 'description': 'md5:ebf346da789428841bee0fd2a935ea55',
- 'upload_date': '20140915',
- 'timestamp': 1410822000,
+ 'title': 'Infô soir',
+ 'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
+ 'upload_date': '20150718',
+ 'timestamp': 1437241200,
+ 'duration': 414,
+ },
+ },
+ {
+ # francetv embed
+ 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
+ 'info_dict': {
+ 'id': 'EV_30231',
+ 'ext': 'flv',
+ 'title': 'Alcaline, le concert avec Calogero',
+ 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+ 'upload_date': '20150226',
+ 'timestamp': 1424989860,
+ 'duration': 5400,
},
},
+ {
+ 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.franceo.fr/videos/125377617',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id'))
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
video_id, catalogue = self._html_search_regex(
- r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+ r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
webpage, 'video ID').split('@')
return self._extract_video(video_id, catalogue)
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index dd87257c4..f5f13689c 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -53,7 +53,7 @@ class FunnyOrDieIE(InfoExtractor):
for bitrate in bitrates:
for link in links:
formats.append({
- 'url': '%s%d.%s' % (link[0], bitrate, link[1]),
+ 'url': self._proto_relative_url('%s%d.%s' % (link[0], bitrate, link[1])),
'format_id': '%s-%d' % (link[1], bitrate),
'vbr': bitrate,
})
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 47373e215..b3f1bafcc 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -5,7 +5,7 @@ import json
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
@@ -14,8 +14,8 @@ from ..utils import (
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
- _TEST = {
+ _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+ _TESTS = [{
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
'info_dict': {
@@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):
'ext': 'mp4',
'title': 'Arma 3 - Community Guide: SITREP I',
'description': 'Check out this video where some of the basics of Arma 3 is explained.',
- }
- }
+ },
+ }, {
+ 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+ 'info_dict': {
+ 'id': 'gs-2300-6424837',
+ 'ext': 'flv',
+ 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing',
+ 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+ },
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
@@ -32,30 +40,42 @@ class GameSpotIE(InfoExtractor):
data_video_json = self._search_regex(
r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
+ streams = data_video['videoStreams']
- # Transform the manifest url to a link to the mp4 files
- # they are used in mobile devices.
- f4m_url = data_video['videoStreams']['f4m_stream']
- f4m_path = compat_urlparse.urlparse(f4m_url).path
- QUALITIES_RE = r'((,\d+)+,?)'
- qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
- http_path = f4m_path[1:].split('/', 1)[1]
- http_template = re.sub(QUALITIES_RE, r'%s', http_path)
- http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin(
- 'http://video.gamespotcdn.com/', http_template)
formats = []
- for q in qualities:
- formats.append({
- 'url': http_template % q,
- 'ext': 'mp4',
- 'format_id': q,
- })
+ f4m_url = streams.get('f4m_stream')
+ if f4m_url is not None:
+ # Transform the manifest url to a link to the mp4 files
+ # they are used in mobile devices.
+ f4m_path = compat_urlparse.urlparse(f4m_url).path
+ QUALITIES_RE = r'((,\d+)+,?)'
+ qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
+ http_path = f4m_path[1:].split('/', 1)[1]
+ http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+ http_template = http_template.replace('.csmil/manifest.f4m', '')
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
+ for q in qualities:
+ formats.append({
+ 'url': http_template % q,
+ 'ext': 'mp4',
+ 'format_id': q,
+ })
+ else:
+ for quality in ['sd', 'hd']:
+ # It's actually a link to a flv file
+ flv_url = streams.get('f4m_{0}'.format(quality))
+ if flv_url is not None:
+ formats.append({
+ 'url': flv_url,
+ 'ext': 'flv',
+ 'format_id': quality,
+ })
return {
'id': data_video['guid'],
'display_id': page_id,
- 'title': compat_urllib_parse.unquote(data_video['title']),
+ 'title': compat_urllib_parse_unquote(data_video['title']),
'formats': formats,
'description': self._html_search_meta('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 43f916412..a6834db43 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -7,7 +7,10 @@ from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
-from ..utils import remove_end
+from ..utils import (
+ remove_end,
+ HEADRequest,
+)
class GDCVaultIE(InfoExtractor):
@@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor):
return video_formats
def _parse_flv(self, xml_description):
- video_formats = []
+ formats = []
akamai_url = xml_description.find('./metadata/akamaiHost').text
+ audios = xml_description.find('./metadata/audios')
+ if audios is not None:
+ for audio in audios:
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(audio.get('url'), '.flv'),
+ 'ext': 'flv',
+ 'vcodec': 'none',
+ 'format_id': audio.get('code'),
+ })
slide_video_path = xml_description.find('./metadata/slideVideo').text
- video_formats.append({
+ formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(slide_video_path, '.flv'),
'ext': 'flv',
@@ -86,7 +99,7 @@ class GDCVaultIE(InfoExtractor):
'format_id': 'slides',
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
- video_formats.append({
+ formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(speaker_video_path, '.flv'),
'ext': 'flv',
@@ -95,7 +108,7 @@ class GDCVaultIE(InfoExtractor):
'preference': -1,
'format_id': 'speaker',
})
- return video_formats
+ return formats
def _login(self, webpage_url, display_id):
(username, password) = self._get_login_info()
@@ -133,16 +146,18 @@ class GDCVaultIE(InfoExtractor):
r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
start_page, 'url', default=None)
if direct_url:
- video_url = 'http://www.gdcvault.com/' + direct_url
title = self._html_search_regex(
r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
start_page, 'title')
+ video_url = 'http://www.gdcvault.com' + direct_url
+ # resolve the url so that we can detect the correct extension
+ head = self._request_webpage(HEADRequest(video_url), video_id)
+ video_url = head.geturl()
return {
'id': video_id,
'display_id': display_id,
'url': video_url,
- 'ext': 'flv',
'title': title,
}
@@ -168,8 +183,8 @@ class GDCVaultIE(InfoExtractor):
# Fallback to the older format
xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
- xml_decription_url = xml_root + 'xml/' + xml_name
- xml_description = self._download_xml(xml_decription_url, display_id)
+ xml_description_url = xml_root + 'xml/' + xml_name
+ xml_description = self._download_xml(xml_description_url, display_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 4946cc132..953ec32c3 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -8,7 +8,8 @@ import re
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
)
@@ -32,11 +33,22 @@ from .brightcove import BrightcoveIE
from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
+from .tvc import TVCIE
+from .sportbox import SportBoxEmbedIE
from .smotri import SmotriIE
+from .myvi import MyviIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
from .senateisvp import SenateISVPIE
from .bliptv import BlipTVIE
+from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
+from .onionstudios import OnionStudiosIE
+from .snagfilms import SnagFilmsEmbedIE
+from .screenwavemedia import ScreenwaveMediaIE
class GenericIE(InfoExtractor):
@@ -44,6 +56,180 @@ class GenericIE(InfoExtractor):
_VALID_URL = r'.*'
IE_NAME = 'generic'
_TESTS = [
+ # Direct link to a video
+ {
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'ext': 'mp4',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
+ }
+ },
+ # Direct link to media delivered compressed (until Accept-Encoding is *)
+ {
+ 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+ 'md5': '128c42e68b13950268b648275386fc74',
+ 'info_dict': {
+ 'id': 'FictionJunction-Parallel_Hearts',
+ 'ext': 'flac',
+ 'title': 'FictionJunction-Parallel_Hearts',
+ 'upload_date': '20140522',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # RSS feed
+ {
+ 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'info_dict': {
+ 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'title': 'Zero Punctuation',
+ 'description': 're:.*groundbreaking video review series.*'
+ },
+ 'playlist_mincount': 11,
+ },
+ # RSS feed with enclosure
+ {
+ 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'info_dict': {
+ 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ 'ext': 'm4v',
+ 'upload_date': '20150228',
+ 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ }
+ },
+ # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+ {
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+ 'info_dict': {
+ 'id': 'smil',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'formats': 'mincount:16',
+ 'subtitles': 'mincount:1',
+ },
+ 'params': {
+ 'force_generic_extractor': True,
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+ {
+ 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+ 'info_dict': {
+ 'id': 'hds',
+ 'ext': 'flv',
+ 'title': 'hds',
+ 'formats': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from https://www.restudy.dk/video/play/id/1637
+ {
+ 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+ 'info_dict': {
+ 'id': 'video_1637',
+ 'ext': 'flv',
+ 'title': 'video_1637',
+ 'formats': 'mincount:3',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+ {
+ 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+ 'info_dict': {
+ 'id': 'smil-service',
+ 'ext': 'flv',
+ 'title': 'smil-service',
+ 'formats': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+ {
+ 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+ 'info_dict': {
+ 'id': '4719370',
+ 'ext': 'mp4',
+ 'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+ 'formats': 'mincount:3',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html
+ {
+ 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf',
+ 'info_dict': {
+ 'id': 'mZlp2ctYIUEB',
+ 'ext': 'mp4',
+ 'title': 'Tikibad ontruimd wegens brand',
+ 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 33,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
{
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -123,17 +309,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, # m3u8 download
},
},
- # Direct link to a video
- {
- 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
- 'info_dict': {
- 'id': 'trailer',
- 'ext': 'mp4',
- 'title': 'trailer',
- 'upload_date': '20100513',
- }
- },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -145,6 +320,19 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
+ {
+ # ooyala video embedded with http://player.ooyala.com/iframe.js
+ 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+ 'info_dict': {
+ 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+ 'ext': 'mp4',
+ 'title': '"Steve Jobs: Man in the Machine" trailer',
+ 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# multiple ooyala embeds on SBN network websites
{
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@ -158,22 +346,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
- # google redirect
- {
- 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
- 'info_dict': {
- 'id': 'cmQHVoWB5FY',
- 'ext': 'mp4',
- 'upload_date': '20130224',
- 'uploader_id': 'TheVerge',
- 'description': 're:^Chris Ziegler takes a look at the\.*',
- 'uploader': 'The Verge',
- 'title': 'First Firefox OS phones side-by-side',
- },
- 'params': {
- 'skip_download': False,
- }
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -201,14 +373,6 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
- # BBC iPlayer embeds
- {
- 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
- 'info_dict': {
- 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
- },
- 'playlist_mincount': 18,
- },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -223,6 +387,66 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # TVC embed
+ {
+ 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+ 'info_dict': {
+ 'id': '55304',
+ 'ext': 'mp4',
+ 'title': 'Дошкольное воспитание',
+ },
+ },
+ # SportBox embed
+ {
+ 'url': 'http://www.vestifinance.ru/articles/25753',
+ 'info_dict': {
+ 'id': '25753',
+ 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '370908',
+ 'title': 'Госзаказ. День 3',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370905',
+ 'title': 'Госзаказ. День 2',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370902',
+ 'title': 'Госзаказ. День 1',
+ 'ext': 'mp4',
+ }
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ # Myvi.ru embed
+ {
+ 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+ 'info_dict': {
+ 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+ 'ext': 'mp4',
+ 'title': 'Ужастики, русский трейлер (2015)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 153,
+ }
+ },
+ # XHamster embed
+ {
+ 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+ 'info_dict': {
+ 'id': 'showthread',
+ 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+ },
+ 'playlist_mincount': 7,
+ },
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -272,6 +496,26 @@ class GenericIE(InfoExtractor):
'skip_download': 'Requires rtmpdump'
}
},
+ # francetv embed
+ {
+ 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+ 'info_dict': {
+ 'id': 'EV_30231',
+ 'ext': 'mp4',
+ 'title': 'Alcaline, le concert avec Calogero',
+ 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+ 'upload_date': '20150226',
+ 'timestamp': 1424989860,
+ 'duration': 5400,
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ 'expected_warnings': [
+ 'Forbidden'
+ ]
+ },
# Condé Nast embed
{
'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -374,16 +618,6 @@ class GenericIE(InfoExtractor):
'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
- # RSS feed
- {
- 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'info_dict': {
- 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'title': 'Zero Punctuation',
- 'description': 're:.*groundbreaking video review series.*'
- },
- 'playlist_mincount': 11,
- },
# Multiple brightcove videos
# https://github.com/rg3/youtube-dl/issues/2283
{
@@ -437,21 +671,6 @@ class GenericIE(InfoExtractor):
'uploader': 'thoughtworks.wistia.com',
},
},
- # Direct download with broken HEAD
- {
- 'url': 'http://ai-radio.org:8000/radio.opus',
- 'info_dict': {
- 'id': 'radio',
- 'ext': 'opus',
- 'title': 'radio',
- },
- 'params': {
- 'skip_download': True, # infinite live stream
- },
- 'expected_warnings': [
- r'501.*Not Implemented'
- ],
- },
# Soundcloud embed
{
'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -483,21 +702,6 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 2,
},
- # Direct link with incorrect MIME type
- {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'md5': '4ccbebe5f36706d85221f204d7eb5913',
- 'info_dict': {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'id': '5_Lennart_Poettering_-_Systemd',
- 'ext': 'webm',
- 'title': '5_Lennart_Poettering_-_Systemd',
- 'upload_date': '20141120',
- },
- 'expected_warnings': [
- 'URL could be a direct video link, returning it as such.'
- ]
- },
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -585,6 +789,18 @@ class GenericIE(InfoExtractor):
'title': 'John Carlson Postgame 2/25/15',
},
},
+ # Kaltura embed (different embed code)
+ {
+ 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+ 'info_dict': {
+ 'id': '1_a52wc67y',
+ 'ext': 'flv',
+ 'upload_date': '20150127',
+ 'uploader_id': 'PremierMedia',
+ 'timestamp': int,
+ 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+ },
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -645,15 +861,16 @@ class GenericIE(InfoExtractor):
'title': 'Facebook Creates "On This Day" | Crunch Report',
},
},
- # RSS feed with enclosure
+ # SVT embed
{
- 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
'info_dict': {
- 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- 'ext': 'm4v',
- 'upload_date': '20150228',
- 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- }
+ 'id': '2900353',
+ 'ext': 'flv',
+ 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+ 'duration': 27,
+ 'age_limit': 0,
+ },
},
# Crooks and Liars embed
{
@@ -729,6 +946,72 @@ class GenericIE(InfoExtractor):
# rtmpe downloads
'skip_download': True,
}
+ },
+ # Brightcove URL in single quotes
+ {
+ 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+ 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+ 'info_dict': {
+ 'id': '4255764656001',
+ 'ext': 'mp4',
+ 'title': 'SN Presents: Russell Martin, World Citizen',
+ 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+ 'uploader': 'Rogers Sportsnet',
+ },
+ },
+ # Dailymotion Cloud video
+ {
+ 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+ 'md5': '49444254273501a64675a7e68c502681',
+ 'info_dict': {
+ 'id': '5585de919473990de4bee11b',
+ 'ext': 'mp4',
+ 'title': 'Le débat',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
+ },
+ # OnionStudios embed
+ {
+ 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+ 'info_dict': {
+ 'id': '2855',
+ 'ext': 'mp4',
+ 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ 'uploader': 'ClickHole',
+ 'uploader_id': 'clickhole',
+ }
+ },
+ # SnagFilms embed
+ {
+ 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+ 'info_dict': {
+ 'id': '74849a00-85a9-11e1-9660-123139220831',
+ 'ext': 'mp4',
+ 'title': '#whilewewatch',
+ }
+ },
+ # AdobeTVVideo embed
+ {
+ 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
+ },
+ # ScreenwaveMedia embed
+ {
+ 'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1',
+ 'md5': '24ace5baba0d35d55c6810b51f34e9e0',
+ 'info_dict': {
+ 'id': 'cinemasnob-55d26273809dd',
+ 'ext': 'mp4',
+ 'title': 'cinemasnob',
+ },
}
]
@@ -850,7 +1133,7 @@ class GenericIE(InfoExtractor):
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
else:
- video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+ video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
self.to_screen('%s: Requesting header' % video_id)
@@ -872,7 +1155,9 @@ class GenericIE(InfoExtractor):
full_response = None
if head_response is False:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
head_response = full_response
# Check for direct link to a video
@@ -883,7 +1168,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'formats': [{
'format_id': m.group('format_id'),
@@ -894,10 +1179,22 @@ class GenericIE(InfoExtractor):
}
if not self._downloader.params.get('test', False) and not is_intentional:
- self._downloader.report_warning('Falling back on generic information extractor.')
+ force = self._downloader.params.get('force_generic_extractor', False)
+ self._downloader.report_warning(
+ '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
if not full_response:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+ # making it impossible to download only a chunk of the file (yet we need only 512kB to
+ # test whether it's HTML or not). With youtube-dl's default Accept-Encoding
+ # this will always result in downloading the whole file, which is not desirable.
+ # Therefore, for the extraction pass we have to override Accept-Encoding to accept any
+ # encoding, so that we receive raw bytes and are able to download only a chunk.
+ # It would probably be better to solve this by checking Content-Type for application/octet-stream
+ # after the HEAD request finishes, but we are not sure whether we can rely on this.
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
@@ -909,7 +1206,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'url': url,
'upload_date': upload_date,
@@ -920,11 +1217,15 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
- # Is it an RSS feed?
+ # Is it an RSS feed, a SMIL file or an XSPF playlist?
try:
doc = parse_xml(webpage)
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
+ elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+ return self._parse_smil(doc, url, video_id)
+ elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+ return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
except compat_xml_parse_error:
pass
@@ -936,7 +1237,7 @@ class GenericIE(InfoExtractor):
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
- webpage = compat_urllib_parse.unquote(webpage)
+ webpage = compat_urllib_parse_unquote(webpage)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
@@ -989,23 +1290,20 @@ class GenericIE(InfoExtractor):
# Look for embedded rtl.nl player
matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+ r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage)
if matches:
return _playlist_from_matches(matches, ie='RtlNl')
- # Look for embedded (iframe) Vimeo player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
- if mobj:
- player_url = unescapeHTML(mobj.group('url'))
- surl = smuggle_url(player_url, {'Referer': url})
- return self.url_result(surl)
- # Look for embedded (swf embed) Vimeo player
- mobj = re.search(
- r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
- if mobj:
- return self.url_result(mobj.group(1))
+ vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
+
+ vid_me_embed_url = self._search_regex(
+ r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+ webpage, 'vid.me embed', default=None)
+ if vid_me_embed_url is not None:
+ return self.url_result(vid_me_embed_url, 'Vidme')
# Look for embedded YouTube player
matches = re.findall(r'''(?x)
@@ -1078,6 +1376,11 @@ class GenericIE(InfoExtractor):
if bliptv_url:
return self.url_result(bliptv_url, 'BlipTV')
+ # Look for SVT player
+ svt_url = SVTIE._extract_url(webpage)
+ if svt_url:
+ return self.url_result(svt_url, 'SVT')
+
# Look for embedded condenast player
matches = re.findall(
r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
@@ -1128,7 +1431,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
- mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
@@ -1194,7 +1497,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
- return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+ return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -1212,6 +1515,32 @@ class GenericIE(InfoExtractor):
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
+ # Look for embedded TVC player
+ tvc_url = TVCIE._extract_url(webpage)
+ if tvc_url:
+ return self.url_result(tvc_url, 'TVC')
+
+ # Look for embedded SportBox player
+ sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+ if sportbox_urls:
+ return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
+ # Look for embedded PornHub player
+ pornhub_url = PornHubIE._extract_url(webpage)
+ if pornhub_url:
+ return self.url_result(pornhub_url, 'PornHub')
+
+ # Look for embedded XHamster player
+ xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+ if xhamster_urls:
+ return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
+ # Look for embedded Tvigle player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Tvigle')
+
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1231,11 +1560,23 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ # Look for embedded francetv player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for embedded smotri.com player
smotri_url = SmotriIE._extract_url(webpage)
if smotri_url:
return self.url_result(smotri_url, 'Smotri')
+ # Look for embedded Myvi.ru player
+ myvi_url = MyviIE._extract_url(webpage)
+ if myvi_url:
+ return self.url_result(myvi_url)
+
# Look for embedded soundcloud player
mobj = re.search(
r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
@@ -1289,6 +1630,10 @@ class GenericIE(InfoExtractor):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
+ if not mobj:
+ mobj = re.search(
+ r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+ webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
@@ -1311,8 +1656,8 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = re.search(
- r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
@@ -1367,7 +1712,36 @@ class GenericIE(InfoExtractor):
# Look for Senate ISVP iframe
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
- return self.url_result(surl, 'SenateISVP')
+ return self.url_result(senate_isvp_url, 'SenateISVP')
+
+ # Look for Dailymotion Cloud videos
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+ # Look for OnionStudios embeds
+ onionstudios_url = OnionStudiosIE._extract_url(webpage)
+ if onionstudios_url:
+ return self.url_result(onionstudios_url)
+
+ # Look for SnagFilms embeds
+ snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
+ if snagfilms_url:
+ return self.url_result(snagfilms_url)
+
+ # Look for ScreenwaveMedia embeds
+ mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage)
+ if mobj is not None:
+ return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia')
+
+ # Look for AdobeTVVideo embeds
+ mobj = re.search(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+ webpage)
+ if mobj is not None:
+ return self.url_result(
+ self._proto_relative_url(unescapeHTML(mobj.group(1))),
+ 'AdobeTVVideo')
def check_video(vurl):
if YoutubeIE.suitable(vurl):
@@ -1397,7 +1771,7 @@ class GenericIE(InfoExtractor):
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
found = filter_video(re.findall(
- r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+ r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
if not found:
# Flow player
found = filter_video(re.findall(r'''(?xs)
@@ -1436,7 +1810,7 @@ class GenericIE(InfoExtractor):
if refresh_header:
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
- new_url = found.group(1)
+ new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
self.report_following_redirect(new_url)
return {
'_type': 'url',
@@ -1448,7 +1822,7 @@ class GenericIE(InfoExtractor):
entries = []
for video_url in found:
video_url = compat_urlparse.urljoin(url, video_url)
- video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
# Sometimes, jwplayer extraction will result in a YouTube URL
if YoutubeIE.suitable(video_url):
@@ -1458,7 +1832,8 @@ class GenericIE(InfoExtractor):
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
- if determine_ext(video_url) == 'smil':
+ ext = determine_ext(video_url)
+ if ext == 'smil':
entries.append({
'id': video_id,
'formats': self._extract_smil_formats(video_url, video_id),
@@ -1466,6 +1841,8 @@ class GenericIE(InfoExtractor):
'title': video_title,
'age_limit': age_limit,
})
+ elif ext == 'xspf':
+ return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
else:
entries.append({
'id': video_id,
diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py
index 397f1d42e..884700c52 100644
--- a/youtube_dl/extractor/gfycat.py
+++ b/youtube_dl/extractor/gfycat.py
@@ -6,12 +6,13 @@ from ..utils import (
int_or_none,
float_or_none,
qualities,
+ ExtractorError,
)
class GfycatIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?P<id>[^/?#]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P<id>[^/?#]+)'
+ _TESTS = [{
'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
'info_dict': {
'id': 'DeadlyDecisiveGermanpinscher',
@@ -27,14 +28,33 @@ class GfycatIE(InfoExtractor):
'categories': list,
'age_limit': 0,
}
- }
+ }, {
+ 'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
+ 'info_dict': {
+ 'id': 'JauntyTimelyAmazontreeboa',
+ 'ext': 'mp4',
+ 'title': 'JauntyTimelyAmazontreeboa',
+ 'timestamp': 1411720126,
+ 'upload_date': '20140926',
+ 'uploader': 'anonymous',
+ 'duration': 3.52,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'categories': list,
+ 'age_limit': 0,
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
gfy = self._download_json(
'http://gfycat.com/cajax/get/%s' % video_id,
- video_id, 'Downloading video info')['gfyItem']
+ video_id, 'Downloading video info')
+ if 'error' in gfy:
+ raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
+ gfy = gfy['gfyItem']
title = gfy.get('title') or gfy['gfyName']
description = gfy.get('description')
diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py
index 775890112..28eb733e2 100644
--- a/youtube_dl/extractor/giga.py
+++ b/youtube_dl/extractor/giga.py
@@ -85,7 +85,8 @@ class GigaIE(InfoExtractor):
r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
view_count = str_to_int(self._search_regex(
- r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False))
+ r'<span class="views"><strong>([\d.,]+)</strong>',
+ webpage, 'view count', fatal=False))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py
index ed2623456..f006f0cb1 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/gorillavid.py
@@ -35,13 +35,7 @@ class GorillaVidIE(InfoExtractor):
},
}, {
'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html',
- 'md5': 'c9e293ca74d46cad638e199c3f3fe604',
- 'info_dict': {
- 'id': 'z08zf8le23c6',
- 'ext': 'mp4',
- 'title': 'Say something nice',
- 'thumbnail': 're:http://.*\.jpg',
- },
+ 'only_matching': True,
}, {
'url': 'http://daclips.in/3rso4kdn6f9m',
'md5': '1ad8fd39bb976eeb66004d3a4895f106',
@@ -84,12 +78,7 @@ class GorillaVidIE(InfoExtractor):
if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- (?:id="[^"]+"\s+)?
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
if fields['op'] == 'download1':
countdown = int_or_none(self._search_regex(
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
index 63d87b74c..f5aa73d18 100644
--- a/youtube_dl/extractor/hentaistigma.py
+++ b/youtube_dl/extractor/hentaistigma.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
- r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+ r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
webpage, 'title')
wrap_url = self._html_search_regex(
- r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+ r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
wrap_webpage = self._download_webpage(wrap_url, video_id)
video_url = self._html_search_regex(
- r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+ r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/historicfilms.py b/youtube_dl/extractor/historicfilms.py
index 40afbe537..6a36933ac 100644
--- a/youtube_dl/extractor/historicfilms.py
+++ b/youtube_dl/extractor/historicfilms.py
@@ -25,7 +25,8 @@ class HistoricFilmsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
tape_id = self._search_regex(
- r'class="tapeId">([^<]+)<', webpage, 'tape id')
+ [r'class="tapeId"[^>]*>([^<]+)<', r'tapeId\s*:\s*"([^"]+)"'],
+ webpage, 'tape id')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
index 704d0285d..a3154cfde 100644
--- a/youtube_dl/extractor/hostingbulk.py
+++ b/youtube_dl/extractor/hostingbulk.py
@@ -58,11 +58,7 @@ class HostingBulkIE(InfoExtractor):
r'<img src="([^"]+)".+?class="pic"',
webpage, 'thumbnail', fatal=False)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
request = compat_urllib_request.Request(url, urlencode_postdata(fields))
request.add_header('Content-type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 3f7d6666c..16677f179 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -1,8 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import parse_iso8601
class HowcastIE(InfoExtractor):
@@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):
'info_dict': {
'id': '390161',
'ext': 'mp4',
- 'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
- }
+ 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+ 'timestamp': 1276081287,
+ 'upload_date': '20100609',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
+ """Hand the Ooyala embed code found on the page over to the Ooyala extractor."""
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
- video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
- video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
- webpage, 'video URL')
-
- video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
- webpage, 'description', fatal=False)
+ # Stop the capture at the first quote or ampersand of the iframe src.
+ # A class written as [^\b] only excludes the backspace character (\b is
+ # not a word boundary inside [...]), so it would not end at the attribute.
+ embed_code = self._search_regex(
+ r'<iframe[^>]+src="[^"]+\bembed_code=([^"&]+)',
+ webpage, 'ooyala embed code')
return {
+ # url_transparent: the Ooyala result is used, with the fields below
+ # taken from this page.
+ '_type': 'url_transparent',
+ 'ie_key': 'Ooyala',
+ 'url': 'ooyala:%s' % embed_code,
'id': video_id,
- 'url': video_url,
- 'title': self._og_search_title(webpage),
- 'description': video_description,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'timestamp': parse_iso8601(self._html_search_meta(
+ 'article:published_time', webpage, 'timestamp')),
}
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index e97339121..663e6632a 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -10,7 +10,7 @@ from ..utils import (
class HowStuffWorksIE(InfoExtractor):
- _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+ _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
_TESTS = [
{
'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
@@ -46,6 +46,10 @@ class HowStuffWorksIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
},
},
+ {
+ 'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f29df36b5..4bb574cf3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
format_info = info['videoPlayerObject']['video']
formats.append({
'format_id': f_id,
- 'url': format_info['url'],
+ 'url': format_info['videoInfoList'][0]['videoUrl'],
})
return {
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
index fe5d95e2c..70c8ca64e 100644
--- a/youtube_dl/extractor/imgur.py
+++ b/youtube_dl/extractor/imgur.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
int_or_none,
js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
class ImgurIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ compat_urlparse.urljoin(url, video_id), video_id)
width = int_or_none(self._search_regex(
r'<param name="width" value="([0-9]+)"',
@@ -95,3 +97,28 @@ class ImgurIE(InfoExtractor):
'description': self._og_search_description(webpage),
'title': self._og_search_title(webpage),
}
+
+
+class ImgurAlbumIE(InfoExtractor):
+    """Playlist extractor for imgur gallery pages."""
+
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)'
+
+    _TEST = {
+        'url': 'http://imgur.com/gallery/Q95ko',
+        'info_dict': {
+            'id': 'Q95ko',
+        },
+        'playlist_count': 25,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist holding one URL result per image hash of the gallery."""
+        album_id = self._match_id(url)
+
+        album = self._download_json(
+            'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id,
+            album_id)
+
+        entries = []
+        for image in album['data']['images']:
+            # Entries without a hash cannot be turned into an image URL.
+            if not image.get('hash'):
+                continue
+            entries.append(
+                self.url_result('http://imgur.com/%s' % image['hash']))
+
+        return self.playlist_result(entries, album_id)
diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py
index 0847074ee..65712abc2 100644
--- a/youtube_dl/extractor/ina.py
+++ b/youtube_dl/extractor/ina.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
new file mode 100644
index 000000000..12fb5e8e1
--- /dev/null
+++ b/youtube_dl/extractor/indavideo.py
@@ -0,0 +1,142 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
+)
+
+
+class IndavideoEmbedIE(InfoExtractor):
+    """Extractor for indavideo.hu player/embed URLs.
+
+    The hash taken from the URL is looked up through the player's JSON
+    endpoint, whose ``data`` object supplies the metadata and file URLs.
+    """
+
+    _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
+        'md5': 'f79b009c66194acacd40712a6778acfa',
+        'info_dict': {
+            'id': '1837039',
+            'ext': 'mp4',
+            'title': 'Cicatánc',
+            'description': '',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'cukiajanlo',
+            'uploader_id': '83729',
+            'timestamp': 1439193826,
+            'upload_date': '20150810',
+            'duration': 72,
+            'age_limit': 0,
+            'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
+        },
+    }, {
+        'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
+        'only_matching': True,
+    }, {
+        'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Download the player JSON for the URL's hash and build the info dict.
+
+        The returned ``id`` is the ``id`` field of the JSON data when
+        present, otherwise the hash matched from the URL.
+        """
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
+            video_id)['data']
+
+        title = video['title']
+
+        # Gather candidate URLs from both fields, skipping missing/empty
+        # values and duplicates while keeping first-seen order. A None
+        # entry would break the prefix computation and the formats below.
+        # NOTE(review): assumes 'video_files' is a list of URLs - confirm.
+        candidates = list(video.get('video_files') or [])
+        candidates.append(video.get('video_file'))
+        video_urls = []
+        for candidate in candidates:
+            if candidate and candidate not in video_urls:
+                video_urls.append(candidate)
+
+        # Entries of 'flv_files' are joined onto the directory part of the
+        # first known URL; without any known URL there is nothing to join to.
+        if video_urls:
+            video_prefix = video_urls[0].rsplit('/', 1)[0]
+            for flv_file in video.get('flv_files', []):
+                flv_url = '%s/%s' % (video_prefix, flv_file)
+                if flv_url not in video_urls:
+                    video_urls.append(flv_url)
+
+        formats = [{
+            'url': video_url,
+            # Height is read from a ".<3-4 digits>.mp4" file name suffix and
+            # stored as a number rather than the matched string.
+            'height': int_or_none(self._search_regex(
+                r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None)),
+        } for video_url in video_urls]
+        self._sort_formats(formats)
+
+        timestamp = video.get('date')
+        if timestamp:
+            # upload date is in CEST
+            timestamp = parse_iso8601(timestamp + ' +0200', ' ')
+
+        thumbnails = [{
+            'url': self._proto_relative_url(thumbnail)
+        } for thumbnail in video.get('thumbnails', [])]
+
+        tags = [tag['title'] for tag in video.get('tags', [])]
+
+        return {
+            'id': video.get('id') or video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnails': thumbnails,
+            'uploader': video.get('user_name'),
+            'uploader_id': video.get('user_id'),
+            'timestamp': timestamp,
+            'duration': int_or_none(video.get('length')),
+            'age_limit': parse_age_limit(video.get('age_limit')),
+            'tags': tags,
+            'formats': formats,
+        }
+
+
+class IndavideoIE(InfoExtractor):
+    """Extractor for indavideo.hu video pages (including subdomains).
+
+    The page is only used to locate the embed URL; extraction itself is
+    delegated to IndavideoEmbedIE.
+    """
+
+    _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/video/Vicces_cica_1',
+        'md5': '8c82244ba85d2a2310275b318eb51eac',
+        'info_dict': {
+            'id': '1335611',
+            'display_id': 'Vicces_cica_1',
+            'ext': 'mp4',
+            'title': 'Vicces cica',
+            'description': 'Játszik a tablettel. :D',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Jet_Pack',
+            'uploader_id': '491217',
+            'timestamp': 1390821212,
+            'upload_date': '20140127',
+            'duration': 7,
+            'age_limit': 0,
+            'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'],
+        },
+    }, {
+        'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz',
+        'only_matching': True,
+    }, {
+        'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko',
+        'only_matching': True,
+    }, {
+        'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci',
+        'only_matching': True,
+    }, {
+        'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt',
+        'only_matching': True,
+    }, {
+        'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Find the page's ``video_src`` link and return it as a transparent URL result."""
+        display_id = self._match_id(url)
+        page = self._download_webpage(url, display_id)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'IndavideoEmbed',
+            'url': self._search_regex(
+                r'<link[^>]+rel="video_src"[^>]+href="(.+?)"',
+                page, 'embed url'),
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index f25f43664..71cfd12c5 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -4,14 +4,15 @@ import base64
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urlparse,
)
class InfoQIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+ _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
'info_dict': {
@@ -20,7 +21,10 @@ class InfoQIE(InfoExtractor):
'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
'title': 'A Few of My Favorite [Python] Things',
},
- }
+ }, {
+ 'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,14 +39,14 @@ class InfoQIE(InfoExtractor):
# Extract video URL
encoded_id = self._search_regex(
r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
- real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
+ real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
playpath = 'mp4:' + real_id
video_filename = playpath.split('/')[-1]
video_id, extension = video_filename.split('.')
http_base = self._search_regex(
- r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+ r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,
'HTTP base URL')
formats = [{
@@ -52,7 +56,7 @@ class InfoQIE(InfoExtractor):
'play_path': playpath,
}, {
'format_id': 'http',
- 'url': http_base + real_id,
+ 'url': compat_urlparse.urljoin(url, http_base) + real_id,
}]
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 65f6ca103..3d78f78c4 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,13 +3,16 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ limit_length,
+)
class InstagramIE(InfoExtractor):
- _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
+ _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
_TEST = {
- 'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
+ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
@@ -41,11 +44,11 @@ class InstagramIE(InfoExtractor):
class InstagramUserIE(InfoExtractor):
- _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+ _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
_TEST = {
- 'url': 'http://instagram.com/porsche',
+ 'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
@@ -100,11 +103,13 @@ class InstagramUserIE(InfoExtractor):
thumbnails_el = it.get('images', {})
thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
- title = it.get('caption', {}).get('text', it['id'])
+ # In some cases caption is null, which corresponds to None
+ # in python. As a result, it.get('caption', {}) gives None
+ title = (it.get('caption') or {}).get('text', it['id'])
entries.append({
'id': it['id'],
- 'title': title,
+ 'title': limit_length(title, 80),
'formats': formats,
'thumbnail': thumbnail,
'webpage_url': it.get('link'),
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 8529bedfc..821c8ec10 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -11,11 +11,12 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ remove_end,
)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
'id': '39152',
'ext': 'flv',
'title': 'Partička (92)',
- 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+ 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
'id': '9718337',
'ext': 'flv',
'title': 'Tchibo Partička - Jarní móda',
- 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
'thumbnail': 're:^http:.*\.jpg$',
},
'params': {
'skip_download': True, # requires rtmpdump
},
- 'skip': 'Do not have permission to access this page',
+ }, {
+ 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
return {
'id': real_id,
- 'title': self._og_search_title(webpage),
+ 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
- 'description': self._og_search_description(webpage),
+ 'description': self._search_regex(
+ r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+ webpage, 'description', default=None),
}
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644
index 000000000..393e67e35
--- /dev/null
+++ b/youtube_dl/extractor/iqiyi.py
@@ -0,0 +1,273 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import random
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
+
+
+class IqiyiIE(InfoExtractor):
+    """Information extractor for iqiyi.com video pages.
+
+    Each format is delivered as one or more segments.  A page with several
+    segments is returned as a ``multi_video`` result with one entry per
+    segment; a single-segment page is returned as a plain video.
+    """
+    IE_NAME = 'iqiyi'
+    IE_DESC = '爱奇艺'
+
+    # The dot in "iqiyi.com" is escaped so it only matches a literal dot.
+    _VALID_URL = r'http://(?:www\.)iqiyi\.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    # (bid, format_id) pairs.  The numeric bid is also used as the format
+    # preference in _real_extract.
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        """Resolve the final URL of every segment of every known format.
+
+        Returns a dict mapping a format id from _FORMATS_MAP to a list of
+        ``(video_url, filesize)`` tuples, one tuple per segment.
+        """
+        def do_xor(x, y):
+            # The XOR key is selected by the position modulo 3.
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            # Decode an obfuscated path: '-'-separated hex codes, each XORed
+            # with a key chosen by its distance from the end, then reversed.
+            codes = l.split('-')
+            last = len(codes) - 1
+            decoded = ''.join(
+                chr(do_xor(int(code, 16), last - idx))
+                for idx, code in enumerate(codes))
+            return decoded[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            # The key is an MD5 over the server time (in 600 second steps),
+            # a fixed salt and the segment file name.
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if not 0 < int(format_item['bid']) <= 10:
+                continue
+            format_id = self.get_format(format_item['bid'])
+            if format_id is None:
+                # bids 6-9 pass the range check but are not in _FORMATS_MAP.
+                # Keeping a None format id would make
+                # int(self.get_bid(format_id)) fail in _real_extract.
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                # Insert the path key as the second-to-last path component.
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_urls.append((js['l'], filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        """Return the format id for *bid*, or None when it is unknown."""
+        return dict(self._FORMATS_MAP).get(str(bid))
+
+    def get_bid(self, format_id):
+        """Return the bid string for *format_id*, or None when it is unknown."""
+        bids_by_format = dict(
+            (_format_id, _bid) for _bid, _format_id in self._FORMATS_MAP)
+        return bids_by_format.get(format_id)
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        """Download and return the JSON description of the video."""
+        tm = str(int(time.time()))
+        param = {
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': tm,
+            'enc': hashlib.md5(
+                (enc_key + tm + tvid).encode('utf8')).hexdigest(),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': hashlib.md5(
+                (tm + tvid).encode('utf8')).hexdigest()
+        }
+
+        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
+            compat_urllib_parse.urlencode(param)
+        return self._download_json(api_url, video_id)
+
+    def get_enc_key(self, swf_url, video_id):
+        """Return the key used to sign the request in get_raw_data.
+
+        The key is hard-coded; both arguments are currently unused.
+        """
+        enc_key = '3601ba290e4f4662848c710e2122007e'  # last update at 2015-08-10 for Zombie
+        return enc_key
+
+    def _real_extract(self, url):
+        """Return the info dict (video or ``multi_video``) for *url*.
+
+        Raises ExtractorError when the API reports an error code, when the
+        video has no track list, or when no segment could be resolved.
+        """
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('No support iQiyi VIP video')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info: entry i collects segment i of every format
+        entries = []
+        for format_id in video_urls_dict:
+            video_urls = video_urls_dict[format_id]
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        if not entries:
+            # Without this guard entries[0] below raises IndexError.
+            raise ExtractorError('No video formats found')
+
+        for i, entry in enumerate(entries):
+            self._sort_formats(entry['formats'])
+            entry.update(
+                {
+                    'id': '%s_part%d' % (video_id, i + 1),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            # A single segment is reported under the plain video id.
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py
new file mode 100644
index 000000000..214bcd5b5
--- /dev/null
+++ b/youtube_dl/extractor/ir90tv.py
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class Ir90TvIE(InfoExtractor):
+    """Information extractor for 90tv.ir video pages."""
+    _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*'
+    _TESTS = [{
+        'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'md5': '411dbd94891381960cb9e13daa47a869',
+        'info_dict': {
+            'id': '95719',
+            'ext': 'mp4',
+            'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at *url*."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The page title carries a site prefix that is not part of the name.
+        page_title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        title = remove_start(page_title, '90tv.ir :: ')
+
+        video_url = self._search_regex(
+            r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+        thumbnail = self._search_regex(
+            r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
+
+        info = {'url': video_url}
+        info['id'] = video_id
+        info['title'] = title
+        # NOTE(review): 'video_url' repeats the value stored under 'url'.
+        info['video_url'] = video_url
+        info['thumbnail'] = thumbnail
+        return info
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 99a1361f8..bc226fa67 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404302298,
+ 'timestamp': int,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163322193,
+ 'timestamp': int,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
uploader = self._html_search_regex(
r"adduserUsername\s*=\s*'([^']+)';",
- webpage, 'uploader', fatal=False, default='')
+ webpage, 'uploader', fatal=False)
timestamp = parse_iso8601(self._html_search_meta(
- 'uploadDate', webpage, 'upload date', fatal=False))
+ 'uploadDate', webpage, 'upload date'))
duration = float_or_none(self._html_search_regex(
r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
# Might be empty for some videos.
streams = self._html_search_regex(
- r'"qualitylevel"\s*:\s*"([^"]+)"',
- webpage, 'streams', fatal=False, default='')
+ r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
formats = []
if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
formats.append({
'format_id': '%sp' % quality if quality else 'sd',
- 'url': url,
+ 'url': compat_urllib_parse_unquote(url),
'ext': ext,
})
else:
stream_url = self._search_regex(
- r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+ r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
formats.append({
'format_id': 'sd',
- 'url': stream_url,
+ 'url': compat_urllib_parse_unquote(stream_url),
'ext': ext,
})
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index d0720ff56..1df084d87 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -8,9 +8,9 @@ from .common import InfoExtractor
class JeuxVideoIE(InfoExtractor):
- _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
+ _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
'info_dict': {
@@ -19,7 +19,10 @@ class JeuxVideoIE(InfoExtractor):
'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
},
- }
+ }, {
+ 'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index d28730492..3dca0e566 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -13,12 +13,24 @@ from ..utils import (
class KalturaIE(InfoExtractor):
_VALID_URL = r'''(?x)
- (?:kaltura:|
- https?://(:?(?:www|cdnapisec)\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_
- )(?P<partner_id>\d+)
- (?::|
- /(?:[^/]+/)*?entry_id/
- )(?P<id>[0-9a-z_]+)'''
+ (?:
+ kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+ https?://
+ (:?(?:www|cdnapisec)\.)?kaltura\.com/
+ (?:
+ (?:
+ # flash player
+ index\.php/kwidget/
+ (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
+ (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+ # html5 player
+ html5/html5lib/
+ (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
+ .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+ )
+ )
+ )
+ '''
_API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
_TESTS = [
{
@@ -43,6 +55,10 @@ class KalturaIE(InfoExtractor):
'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3',
'only_matching': True,
},
+ {
+ 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342',
+ 'only_matching': True,
+ }
]
def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
@@ -105,9 +121,9 @@ class KalturaIE(InfoExtractor):
video_id, actions, note='Downloading video info JSON')
def _real_extract(self, url):
- video_id = self._match_id(url)
mobj = re.match(self._VALID_URL, url)
- partner_id, entry_id = mobj.group('partner_id'), mobj.group('id')
+ partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
+ entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
info, source_data = self._get_video_info(entry_id, partner_id)
@@ -126,7 +142,7 @@ class KalturaIE(InfoExtractor):
self._sort_formats(formats)
return {
- 'id': video_id,
+ 'id': entry_id,
'title': info['name'],
'formats': formats,
'description': info.get('description'),
diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py
index 2bb078036..4597d1b96 100644
--- a/youtube_dl/extractor/kanalplay.py
+++ b/youtube_dl/extractor/kanalplay.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
+ srt_subtitles_timecode,
)
@@ -39,8 +40,8 @@ class KanalPlayIE(InfoExtractor):
'%s\r\n%s --> %s\r\n%s'
% (
num,
- self._subtitles_timecode(item['startMillis'] / 1000.0),
- self._subtitles_timecode(item['endMillis'] / 1000.0),
+ srt_subtitles_timecode(item['startMillis'] / 1000.0),
+ srt_subtitles_timecode(item['endMillis'] / 1000.0),
item['text'],
) for num, item in enumerate(subs, 1))
diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py
index e3b43ff8d..06daf5a89 100644
--- a/youtube_dl/extractor/karaoketv.py
+++ b/youtube_dl/extractor/karaoketv.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
from ..utils import (
js_to_json,
)
@@ -24,7 +24,7 @@ class KaraoketvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
page_video_url = self._og_search_video_url(webpage, video_id)
- config_json = compat_urllib_parse.unquote_plus(self._search_regex(
+ config_json = compat_urllib_parse_unquote_plus(self._search_regex(
r'config=(.*)', page_video_url, 'configuration'))
urls_info_json = self._download_json(
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644
index 000000000..bed94bc93
--- /dev/null
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ fix_xml_ampersands,
+ float_or_none,
+ xpath_with_ns,
+ xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    """Information extractor for karrierevideos.at video pages.
+
+    The page references a player config id; the playlist XML requested with
+    that id supplies the streamer URL and the play path.
+    """
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at *url*."""
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Fall back to the page heading when there is no "title" meta tag.
+        # _search_regex needs the text to search and a field name in
+        # addition to the pattern.
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(
+                     r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
+
+        # From here on video_id is the player config id, not the URL slug.
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        # NOTE(review): find() returns None when the playlist has no item;
+        # confirm whether that case needs a dedicated error.
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            # The meta value may be relative; resolve it against the page URL.
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 7d4b57056..1d391e69f 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
'uploader': 'Pebble Technology',
'title': 'Pebble iOS Notifications',
}
+ }, {
+ 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+ 'info_dict': {
+ 'id': '1420158244',
+ 'ext': 'mp4',
+ 'title': 'Power Drive 2000',
+ },
+ 'expected_warnings': ['OpenGraph description'],
}]
def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
'title': title,
}
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+ if thumbnail is None:
+ thumbnail = self._html_search_regex(
+ r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+ webpage, 'thumbnail image', fatal=False)
return {
'id': video_id,
'url': video_url,
'title': title,
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 720bc939b..a59c529f4 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
class KontrTubeIE(InfoExtractor):
@@ -34,33 +37,28 @@ class KontrTubeIE(InfoExtractor):
webpage = self._download_webpage(
url, display_id, 'Downloading page')
- video_url = self._html_search_regex(
+ video_url = self._search_regex(
r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL')
- thumbnail = self._html_search_regex(
- r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+ thumbnail = self._search_regex(
+ r"preview_url\s*:\s*'(.+?)/?',", webpage, 'thumbnail', fatal=False)
title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'video title')
+ r'(?s)<h2>(.+?)</h2>', webpage, 'title')
description = self._html_search_meta(
- 'description', webpage, 'video description')
+ 'description', webpage, 'description')
- mobj = re.search(
- r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
- webpage)
- duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+ duration = self._search_regex(
+ r'Длительность: <em>([^<]+)</em>', webpage, 'duration', fatal=False)
+ if duration:
+ duration = parse_duration(duration.replace('мин', 'min').replace('сек', 'sec'))
- view_count = self._html_search_regex(
- r'<div class="col_2">Просмотров: <span>(\d+)</span></div>',
+ view_count = self._search_regex(
+ r'Просмотров: <em>([^<]+)</em>',
webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = int_or_none(view_count.replace(' ', ''))
- comment_count = None
- comment_str = self._html_search_regex(
- r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
- if comment_str.startswith('комментариев нет'):
- comment_count = 0
- else:
- mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
- if mobj:
- comment_count = mobj.group('total')
+ comment_count = int_or_none(self._search_regex(
+ r'Комментарии \((\d+)\)<', webpage, ' comment count', fatal=False))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
index 96f95979a..0ae8ebd68 100644
--- a/youtube_dl/extractor/krasview.py
+++ b/youtube_dl/extractor/krasview.py
@@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor):
'duration': 27,
'thumbnail': 're:^https?://.*\.jpg',
},
+ 'params': {
+ 'skip_download': 'Not accessible from Travis CI server',
+ },
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
new file mode 100644
index 000000000..1077846f2
--- /dev/null
+++ b/youtube_dl/extractor/kuwo.py
@@ -0,0 +1,314 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ get_element_by_id,
+ clean_html,
+ ExtractorError,
+ remove_start,
+)
+
+
+class KuwoBaseIE(InfoExtractor):
+    """Base class providing format lookup for the Kuwo extractors."""
+    _FORMATS = [
+        {'format': 'ape', 'ext': 'ape', 'preference': 100},
+        {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
+        {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
+        {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
+        {'format': 'wma', 'ext': 'wma', 'preference': 20},
+        {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
+    ]
+
+    def _get_formats(self, song_id):
+        """Request a URL for every entry of _FORMATS and return the hits.
+
+        Only responses that start with ``http://`` or ``https://`` are kept.
+        The returned list of format dicts is sorted with _sort_formats.
+        """
+        formats = []
+        for fmt in self._FORMATS:
+            query_url = (
+                'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' %
+                (fmt['ext'], fmt.get('br', ''), song_id))
+            song_url = self._download_webpage(
+                query_url, song_id,
+                note='Download %s url info' % fmt['format'],
+            )
+            if not song_url.startswith(('http://', 'https://')):
+                # Anything else is not a usable URL for this format.
+                continue
+            formats.append({
+                'url': song_url,
+                'format_id': fmt['format'],
+                'format': fmt['format'],
+                'preference': fmt['preference'],
+                'abr': fmt.get('abr'),
+            })
+        self._sort_formats(formats)
+        return formats
+
+
+class KuwoIE(KuwoBaseIE):
+    """Information extractor for a single song page on www.kuwo.cn."""
+    IE_NAME = 'kuwo:song'
+    IE_DESC = '酷我音乐'
+    _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/yinyue/635632/',
+        'info_dict': {
+            'id': '635632',
+            'ext': 'ape',
+            'title': '爱我别走',
+            'creator': '张震岳',
+            'upload_date': '20080122',
+            'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
+        },
+    }, {
+        'url': 'http://www.kuwo.cn/yinyue/6446136/',
+        'info_dict': {
+            'id': '6446136',
+            'ext': 'mp3',
+            'title': '心',
+            'creator': 'IU',
+            'upload_date': '20150518',
+        },
+        'params': {
+            'format': 'mp3-320'
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the song page at *url*.
+
+        The upload date is read from the album page when the song page
+        links to one; otherwise it is None.
+        """
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download song detail info',
+            errnote='Unable to get song detail info')
+
+        song_name = self._html_search_regex(
+            r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
+        singer_name = self._html_search_regex(
+            r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
+            webpage, 'singer name', fatal=False)
+
+        lyrics = clean_html(get_element_by_id('lrcContent', webpage))
+        if lyrics == '暂无':  # indicates no lyrics
+            lyrics = None
+
+        formats = self._get_formats(song_id)
+
+        album_id = self._html_search_regex(
+            r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+            webpage, 'album id', fatal=False)
+
+        upload_date = None
+        if album_id is not None:
+            album_page = self._download_webpage(
+                'http://www.kuwo.cn/album/%s/' % album_id, song_id,
+                note='Download album detail info',
+                errnote='Unable to get album detail info')
+
+            release = self._html_search_regex(
+                r'发行时间:(\d{4}-\d{2}-\d{2})', album_page,
+                'publish time', fatal=False)
+            # Turn YYYY-MM-DD into YYYYMMDD; keep an empty result as it is.
+            upload_date = release.replace('-', '') if release else release
+
+        info = {'id': song_id, 'title': song_name}
+        info['creator'] = singer_name
+        info['upload_date'] = upload_date
+        info['description'] = lyrics
+        info['formats'] = formats
+        return info
+
+
+class KuwoAlbumIE(InfoExtractor):
+    """Playlist extractor for an album page on www.kuwo.cn."""
+    IE_NAME = 'kuwo:album'
+    IE_DESC = '酷我音乐 - 专辑'
+    _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/album/502294/',
+        'info_dict': {
+            'id': '502294',
+            'title': 'M',
+            'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
+        },
+        'playlist_count': 2,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per song linked from the album."""
+        album_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, album_id, note='Download album info',
+            errnote='Unable to get album info')
+
+        album_name = self._html_search_regex(
+            r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
+            'album name')
+
+        # The intro text starts with "<album name>简介:"; drop that prefix.
+        intro_text = clean_html(get_element_by_id('intro', webpage))
+        album_intro = remove_start(intro_text, '%s简介:' % album_name)
+
+        song_urls = re.findall(
+            r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
+            webpage)
+        entries = []
+        for song_url in song_urls:
+            entries.append(self.url_result(song_url, 'Kuwo'))
+
+        return self.playlist_result(entries, album_id, album_name, album_intro)
+
+
+class KuwoChartIE(InfoExtractor):
+    """Playlist extractor for a chart (billboard) page on yinyue.kuwo.cn."""
+    IE_NAME = 'kuwo:chart'
+    IE_DESC = '酷我音乐 - 排行榜'
+    # The dot before "htm" is escaped so it only matches a literal dot.
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+)\.htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
+        'info_dict': {
+            'id': '香港中文龙虎榜',
+            'title': '香港中文龙虎榜',
+            'description': 're:\d{4}第\d{2}期',
+        },
+        'playlist_mincount': 10,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per song linked from the chart."""
+        chart_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, chart_id, note='Download chart info',
+            errnote='Unable to get chart info')
+
+        chart_name = self._html_search_regex(
+            r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')
+
+        chart_desc = self._html_search_regex(
+            r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')
+
+        # The trailing slash is part of the captured URL: the song
+        # extractor's URL pattern ('.../yinyue/<id>/') requires it.
+        entries = [
+            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage)
+        ]
+        return self.playlist_result(entries, chart_id, chart_name, chart_desc)
+
+
+class KuwoSingerIE(InfoExtractor):
+    """Playlist extractor for a singer page on www.kuwo.cn.
+
+    For a ``music.htm`` / ``music_<n>.htm`` URL every song list page is
+    walked; for any other singer URL only the first ten songs of the first
+    page are returned.
+    """
+    IE_NAME = 'kuwo:singer'
+    IE_DESC = '酷我音乐 - 歌手'
+    _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
+        'info_dict': {
+            'id': 'bruno+mars',
+            'title': 'Bruno Mars',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
+        'info_dict': {
+            'id': 'Ali',
+            'title': 'Ali',
+        },
+        'playlist_mincount': 95,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist of the singer's songs."""
+        singer_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, singer_id, note='Download singer info',
+            errnote='Unable to get singer info')
+
+        singer_name = self._html_search_regex(
+            r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name'
+        )
+
+        entries = []
+        first_page_only = not re.search(r'/music(?:_\d+)?\.htm', url)
+        for page_num in itertools.count(1):
+            webpage = self._download_webpage(
+                'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
+                singer_id, note='Download song list page #%d' % page_num,
+                errnote='Unable to get song list page #%d' % page_num)
+
+            # The trailing slash is part of the captured URL: the song
+            # extractor's URL pattern ('.../yinyue/<id>/') requires it.
+            entries.extend([
+                self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                    r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)',
+                    webpage)
+            ][:10 if first_page_only else None])
+
+            # Stop when there is no "next page" (下一页) link.
+            if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage):
+                break
+
+        return self.playlist_result(entries, singer_id, singer_name)
+
+
+class KuwoCategoryIE(InfoExtractor):
+    """Playlist extractor for a category page on yinyue.kuwo.cn."""
+    IE_NAME = 'kuwo:category'
+    IE_DESC = '酷我音乐 - 分类'
+    # The dot before "htm" is escaped so it only matches a literal dot.
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?)\.htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
+        'info_dict': {
+            'id': '86375',
+            'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
+        },
+        'playlist_count': 30,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist built from the song list embedded in the page."""
+        category_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, category_id, note='Download category info',
+            errnote='Unable to get category info')
+
+        category_name = self._html_search_regex(
+            r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
+
+        # The intro element is optional: without it the description is None
+        # instead of failing on None.strip().
+        intro = get_element_by_id('intro', webpage)
+        category_desc = None
+        if intro is not None:
+            category_desc = remove_start(
+                intro.strip(), '%s简介:' % category_name)
+
+        # The songs are listed in a JavaScript variable named "jsonm".
+        jsonm = self._parse_json(self._html_search_regex(
+            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
+
+        entries = [
+            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
+            for song in jsonm['musiclist']
+        ]
+        return self.playlist_result(entries, category_id, category_name, category_desc)
+
+
+class KuwoMvIE(KuwoBaseIE):
+    """Information extractor for a music video page on www.kuwo.cn."""
+    IE_NAME = 'kuwo:mv'
+    IE_DESC = '酷我音乐 - MV'
+    _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/mv/6480076/',
+        'info_dict': {
+            'id': '6480076',
+            'ext': 'mkv',
+            'title': '我们家MV',
+            'creator': '2PM',
+        },
+    }
+    # The audio formats of the base class plus two video containers.
+    _FORMATS = KuwoBaseIE._FORMATS + [
+        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
+        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict for the music video page at *url*.
+
+        Raises ExtractorError when the song and singer names are not found.
+        """
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download mv detail info: %s' % song_id,
+            errnote='Unable to get mv detail info: %s' % song_id)
+
+        names = re.search(
+            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
+            webpage)
+        if not names:
+            raise ExtractorError('Unable to find song or singer names')
+        song_name, singer_name = names.group('song'), names.group('singer')
+
+        formats = self._get_formats(song_id)
+
+        info = {'id': song_id, 'title': song_name}
+        info['creator'] = singer_name
+        info['formats'] = formats
+        return info
diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py
new file mode 100644
index 000000000..40a3d2346
--- /dev/null
+++ b/youtube_dl/extractor/lecture2go.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ int_or_none,
+)
+
+
+class Lecture2GoIE(InfoExtractor):
+ _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473',
+ 'md5': 'ac02b570883020d208d405d5a3fd2f7f',
+ 'info_dict': {
+ 'id': '17473',
+ 'ext': 'flv',
+ 'title': '2 - Endliche Automaten und reguläre Sprachen',
+ 'creator': 'Frank Heitmann',
+ 'duration': 5220,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title')
+
+ formats = []
+ for url in set(re.findall(r'"src","([^"]+)"', webpage)):
+ ext = determine_ext(url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(url, video_id))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(url, video_id))
+ else:
+ formats.append({
+ 'url': url,
+ })
+
+ self._sort_formats(formats)
+
+ creator = self._html_search_regex(
+ r'<div[^>]+id="description">([^<]+)</div>', webpage, 'creator', fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r'Duration:\s*</em>\s*<em[^>]*>([^<]+)</em>', webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'Views:\s*</em>\s*<em[^>]+>(\d+)</em>', webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'creator': creator,
+ 'duration': duration,
+ 'view_count': view_count,
+ }
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py
index 1484ac0d2..a28abb0f0 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/letv.py
@@ -15,10 +15,12 @@ from ..utils import (
determine_ext,
ExtractorError,
parse_iso8601,
+ int_or_none,
)
class LetvIE(InfoExtractor):
+ IE_DESC = '乐视网'
_VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
_TESTS = [{
@@ -50,9 +52,7 @@ class LetvIE(InfoExtractor):
'title': '与龙共舞 完整版',
'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
},
- 'params': {
- 'cn_verification_proxy': 'http://proxy.uku.im:8888'
- },
+ 'skip': 'Only available in China',
}]
@staticmethod
@@ -135,7 +135,7 @@ class LetvIE(InfoExtractor):
}
if format_id[-1:] == 'p':
- url_info_dict['height'] = format_id[:-1]
+ url_info_dict['height'] = int_or_none(format_id[:-1])
urls.append(url_info_dict)
diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py
index 9ab1416f5..d375695f5 100644
--- a/youtube_dl/extractor/libsyn.py
+++ b/youtube_dl/extractor/libsyn.py
@@ -8,9 +8,9 @@ from ..utils import unified_strdate
class LibsynIE(InfoExtractor):
- _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)'
+ _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
- _TEST = {
+ _TESTS = [{
'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
'md5': '443360ee1b58007bc3dcf09b41d093bb',
'info_dict': {
@@ -19,12 +19,24 @@ class LibsynIE(InfoExtractor):
'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
'description': 'md5:601cb790edd05908957dae8aaa866465',
'upload_date': '20150220',
+ 'thumbnail': 're:^https?://.*',
},
- }
+ }, {
+ 'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/',
+ 'md5': '6c5cb21acd622d754d3b1a92b582ce42',
+ 'info_dict': {
+ 'id': '3727166',
+ 'ext': 'mp3',
+ 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career',
+ 'upload_date': '20150818',
+ 'thumbnail': 're:^https?://.*',
+ }
+ }]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+ url = m.group('mainurl')
webpage = self._download_webpage(url, video_id)
formats = [{
@@ -32,20 +44,18 @@ class LibsynIE(InfoExtractor):
} for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
podcast_title = self._search_regex(
- r'<h2>([^<]+)</h2>', webpage, 'title')
+ r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
episode_title = self._search_regex(
- r'<h3>([^<]+)</h3>', webpage, 'title', default=None)
+ r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title')
title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
description = self._html_search_regex(
r'<div id="info_text_body">(.+?)</div>', webpage,
- 'description', fatal=False)
-
+ 'description', default=None)
thumbnail = self._search_regex(
r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
webpage, 'thumbnail', fatal=False)
-
release_date = unified_strdate(self._search_regex(
r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 1dfe7f77f..f8cbca7b3 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -4,8 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
+ determine_ext,
int_or_none,
+ remove_end,
unified_strdate,
ExtractorError,
)
@@ -14,9 +17,9 @@ from ..utils import (
class LifeNewsIE(InfoExtractor):
IE_NAME = 'lifenews'
IE_DESC = 'LIFE | NEWS'
- _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
+ _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://lifenews.ru/news/126342',
'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
'info_dict': {
@@ -27,48 +30,139 @@ class LifeNewsIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
'upload_date': '20140130',
}
- }
+ }, {
+ # video in <iframe>
+ 'url': 'http://lifenews.ru/news/152125',
+ 'md5': '77d19a6f0886cd76bdbf44b4d971a273',
+ 'info_dict': {
+ 'id': '152125',
+ 'ext': 'mp4',
+ 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
+ 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+ 'upload_date': '20150402',
+ }
+ }, {
+ 'url': 'http://lifenews.ru/news/153461',
+ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ 'info_dict': {
+ 'id': '153461',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ }
+ }, {
+ 'url': 'http://lifenews.ru/video/13035',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ section = mobj.group('section')
- webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
+ webpage = self._download_webpage(
+ 'http://lifenews.ru/%s/%s' % (section, video_id),
+ video_id, 'Downloading page')
videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
- if not videos:
+ iframe_link = self._html_search_regex(
+ '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
+ if not videos and not iframe_link:
raise ExtractorError('No media links available for %s' % video_id)
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
+ title = remove_end(
+ self._og_search_title(webpage),
+ ' - Первый по срочным новостям — LIFE | NEWS')
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
- r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
+ r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
- r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
+ r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+ webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
- r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+ r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
+ common_info = {
+ 'description': description,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
+ 'upload_date': upload_date,
+ }
+
def make_entry(video_id, media, video_number=None):
- return {
+ cur_info = dict(common_info)
+ cur_info.update({
'id': video_id,
'url': media[1],
'thumbnail': media[0],
'title': title if video_number is None else '%s-video%s' % (title, video_number),
- 'description': description,
- 'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
- }
+ })
+ return cur_info
+
+ if iframe_link:
+ iframe_link = self._proto_relative_url(iframe_link, 'http:')
+ cur_info = dict(common_info)
+ cur_info.update({
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'title': title,
+ 'url': iframe_link,
+ })
+ return cur_info
if len(videos) == 1:
return make_entry(video_id, videos[0])
else:
return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+
+
+class LifeEmbedIE(InfoExtractor):
+ IE_NAME = 'life:embed'
+ _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+
+ _TEST = {
+ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
+ 'md5': 'b889715c9e49cb1981281d0e5458fbbe',
+ 'info_dict': {
+ 'id': 'e50c2dec2867350528e2574c899b8291',
+ 'ext': 'mp4',
+ 'title': 'e50c2dec2867350528e2574c899b8291',
+ 'thumbnail': 're:http://.*\.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+ for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+ video_url = compat_urlparse.urljoin(url, video_url)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='m3u8'))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': ext,
+ 'preference': 1,
+ })
+ self._sort_formats(formats)
+
+ thumbnail = self._search_regex(
+ r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 35822067f..857edfde2 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
}
+ }, {
+ # Covers https://github.com/rg3/youtube-dl/pull/5983
+ 'url': 'http://www.liveleak.com/view?i=801_1409392012',
+ 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'info_dict': {
+ 'id': '801_1409392012',
+ 'ext': 'mp4',
+ 'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+ 'uploader': 'bony333',
+ 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+ }
}]
def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
'url': s['file'],
} for i, s in enumerate(sources)]
for i, s in enumerate(sources):
- orig_url = s['file'].replace('.h264_base.mp4', '')
+ # Removing '.h264_*.mp4' gives the raw video, which is essentially
+ # the same video without the LiveLeak logo at the top (see
+ # https://github.com/rg3/youtube-dl/pull/4768)
+ orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
if s['file'] != orig_url:
formats.append({
'format_id': 'original-%s' % i,
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index ec309dadd..6d7733e41 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -194,23 +194,19 @@ class LivestreamIE(InfoExtractor):
# The original version of Livestream uses a different system
class LivestreamOriginalIE(InfoExtractor):
IE_NAME = 'livestream:original'
- _VALID_URL = r'''(?x)https?://www\.livestream\.com/
+ _VALID_URL = r'''(?x)https?://original\.livestream\.com/
(?P<user>[^/]+)/(?P<type>video|folder)
(?:\?.*?Id=|/)(?P<id>.*?)(&|$)
'''
_TESTS = [{
- 'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+ 'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
'info_dict': {
'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
},
- 'params': {
- # rtmp
- 'skip_download': True,
- },
}, {
- 'url': 'https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+ 'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
'info_dict': {
'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',
},
@@ -221,19 +217,17 @@ class LivestreamOriginalIE(InfoExtractor):
api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
info = self._download_xml(api_url, video_id)
+ # this url is used on mobile devices
+ stream_url = 'http://x{0}x.api.channel.livestream.com/3.0/getstream.json?id={1}'.format(user, video_id)
+ stream_info = self._download_json(stream_url, video_id)
item = info.find('channel').find('item')
ns = {'media': 'http://search.yahoo.com/mrss'}
thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']
- # Remove the extension and number from the path (like 1.jpg)
- path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, 'path')
return {
'id': video_id,
'title': item.find('title').text,
- 'url': 'rtmp://extondemand.livestream.com/ondemand',
- 'play_path': 'trans/dv15/mogulus-{0}'.format(path),
- 'player_url': 'http://static.livestream.com/chromelessPlayer/v21/playerapi.swf?hash=5uetk&v=0803&classid=D27CDB6E-AE6D-11cf-96B8-444553540000&jsEnabled=false&wmode=opaque',
- 'ext': 'flv',
+ 'url': stream_info['progressiveUrl'],
'thumbnail': thumbnail_url,
}
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index cfd3b14f4..378117270 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -11,13 +11,13 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ clean_html,
int_or_none,
)
class LyndaBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
- _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_NETRC_MACHINE = 'lynda'
@@ -30,18 +30,18 @@ class LyndaBaseIE(InfoExtractor):
return
login_form = {
- 'username': username,
- 'password': password,
+ 'username': username.encode('utf-8'),
+ 'password': password.encode('utf-8'),
'remember': 'false',
'stayPut': 'false'
}
request = compat_urllib_request.Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
login_page = self._download_webpage(
request, None, 'Logging in as %s' % username)
# Not (yet) logged in
- m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
+ m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)
if m is not None:
response = m.group('json')
response_json = json.loads(response)
@@ -65,12 +65,21 @@ class LyndaBaseIE(InfoExtractor):
'stayPut': 'false',
}
request = compat_urllib_request.Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
+ self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))
login_page = self._download_webpage(
request, None,
'Confirming log in and log out from another device')
- if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
+ if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
+ if 'login error' in login_page:
+ mobj = re.search(
+ r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
+ login_page)
+ if mobj:
+ raise ExtractorError(
+ 'lynda returned error: %s - %s'
+ % (mobj.group('title'), clean_html(mobj.group('description'))),
+ expected=True)
raise ExtractorError('Unable to log in')
@@ -109,9 +118,7 @@ class LyndaIE(LyndaBaseIE):
'lynda returned error: %s' % video_json['Message'], expected=True)
if video_json['HasAccess'] is False:
- raise ExtractorError(
- 'Video %s is only available for members. '
- % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True)
+ self.raise_login_required('Video %s is only available for members' % video_id)
video_id = compat_str(video_json['ID'])
duration = video_json['DurationInSeconds']
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
index 54a14cb94..ab1300185 100644
--- a/youtube_dl/extractor/mailru.py
+++ b/youtube_dl/extractor/mailru.py
@@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor):
'uploader_id': 'sonypicturesrus@mail.ru',
'duration': 184,
},
+ 'skip': 'Not accessible from Travis CI server',
},
{
'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
@@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor):
'uploader_id': 'hitech@corp.mail.ru',
'duration': 245,
},
+ 'skip': 'Not accessible from Travis CI server',
},
]
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
index 0b85a59d1..92511a671 100644
--- a/youtube_dl/extractor/malemotion.py
+++ b/youtube_dl/extractor/malemotion.py
@@ -2,9 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class MalemotionIE(InfoExtractor):
@@ -24,7 +22,7 @@ class MalemotionIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = compat_urllib_parse.unquote(self._search_regex(
+ video_url = compat_urllib_parse_unquote(self._search_regex(
r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 5fdd19027..fc7499958 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -29,7 +29,7 @@ class MDRIE(InfoExtractor):
doc = self._download_xml(domain + xmlurl, video_id)
formats = []
for a in doc.findall('./assets/asset'):
- url_el = a.find('.//progressiveDownloadUrl')
+ url_el = a.find('./progressiveDownloadUrl')
if url_el is None:
continue
abr = int(a.find('bitrateAudio').text) // 1000
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 8bc333b02..6e2e73a51 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_request,
)
from ..utils import (
@@ -155,7 +156,7 @@ class MetacafeIE(InfoExtractor):
video_url = None
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
if mobj is not None:
- mediaURL = compat_urllib_parse.unquote(mobj.group(1))
+ mediaURL = compat_urllib_parse_unquote(mobj.group(1))
video_ext = mediaURL[-3:]
# Extract gdaKey if available
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index d8897eb90..852d72266 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -5,6 +5,7 @@ import json
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
@@ -20,7 +21,6 @@ class MiTeleIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
- 'md5': '6a75fe9d0d3275bead0cb683c616fddb',
'info_dict': {
'id': '0fce117d',
'ext': 'mp4',
@@ -29,6 +29,10 @@ class MiTeleIE(InfoExtractor):
'display_id': 'programa-144',
'duration': 2913,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -45,7 +49,7 @@ class MiTeleIE(InfoExtractor):
domain = 'http://' + domain
info_url = compat_urlparse.urljoin(
domain,
- compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+ compat_urllib_parse_unquote(embed_data['flashvars']['host'])
)
info_el = self._download_xml(info_url, episode).find('./video/info')
@@ -56,12 +60,14 @@ class MiTeleIE(InfoExtractor):
episode,
transform_source=strip_jsonp
)
+ formats = self._extract_m3u8_formats(
+ token_info['tokenizedUrl'], episode, ext='mp4')
return {
'id': embed_data['videoId'],
'display_id': episode,
'title': info_el.find('title').text,
- 'url': token_info['tokenizedUrl'],
+ 'formats': formats,
'description': get_element_by_attribute('class', 'text', webpage),
'thumbnail': info_el.find('thumb').text,
'duration': parse_duration(info_el.find('duration').text),
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 425a4ccf1..d47aeceda 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
HEADRequest,
@@ -60,7 +58,7 @@ class MixcloudIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
- track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
+ track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index e369551c2..e242b897f 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -10,7 +10,21 @@ from ..utils import (
class MLBIE(InfoExtractor):
- _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:[\da-z_-]+\.)*mlb\.com/
+ (?:
+ (?:
+ (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+ (?:
+ shared/video/embed/(?:embed|m-internal-embed)\.html|
+ (?:[^/]+/)+(?:play|index)\.jsp|
+ )\?.*?\bcontent_id=
+ )
+ (?P<id>n?\d+)|
+ (?:[^/]+/)*(?P<path>[^/]+)
+ )
+ '''
_TESTS = [
{
'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -69,6 +83,18 @@ class MLBIE(InfoExtractor):
},
},
{
+ 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
+ 'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+ 'info_dict': {
+ 'id': '75609783',
+ 'ext': 'mp4',
+ 'title': 'Must C: Pillar climbs for catch',
+ 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
+ 'timestamp': 1429124820,
+ 'upload_date': '20150415',
+ }
+ },
+ {
'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
'only_matching': True,
},
@@ -83,6 +109,15 @@ class MLBIE(InfoExtractor):
{
'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728',
'only_matching': True,
+ },
+ {
+ # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer
+ 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#',
+ 'only_matching': True,
}
]
@@ -90,6 +125,12 @@ class MLBIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ if not video_id:
+ video_path = mobj.group('path')
+ webpage = self._download_webpage(url, video_path)
+ video_id = self._search_regex(
+ [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
+
detail = self._download_xml(
'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
% (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index 2cec12d35..9bf99a54a 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -5,9 +5,9 @@ import re
from .common import InfoExtractor
from ..compat import (
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
)
@@ -34,7 +34,7 @@ class MofosexIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
+ video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 88dcd4f73..69e4bcd1a 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -9,7 +9,10 @@ from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ remove_start,
+)
class MonikerIE(InfoExtractor):
@@ -25,6 +28,14 @@ class MonikerIE(InfoExtractor):
'title': 'youtube-dl test video',
},
}, {
+ 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn',
+ 'md5': '710883dee1bfc370ecf9fa6a89307c88',
+ 'info_dict': {
+ 'id': 'jih3nce3x6wn',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video',
+ },
+ }, {
'url': 'http://vidspot.net/l2ngsmhs8ci5',
'md5': '710883dee1bfc370ecf9fa6a89307c88',
'info_dict': {
@@ -38,7 +49,10 @@ class MonikerIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ orig_video_id = self._match_id(url)
+ video_id = remove_start(orig_video_id, 'embed-')
+ url = url.replace(orig_video_id, video_id)
+ assert re.match(self._VALID_URL, url) is not None
orig_webpage = self._download_webpage(url, video_id)
if '>File Not Found<' in orig_webpage:
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index b48fac5e3..a597714e9 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -67,7 +67,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
return [{'url': url, 'ext': 'mp4'}]
def _extract_video_formats(self, mdoc, mtvn_id):
- if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
+ if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:
if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
self.to_screen('The normal version is not available from your '
'country, trying with the mobile version')
@@ -114,7 +114,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url:
- mediagen_url += '&acceptMethods=fms'
+ mediagen_url += '&' if '?' in mediagen_url else '?'
+ mediagen_url += 'acceptMethods=fms'
mediagen_doc = self._download_xml(mediagen_url, video_id,
'Downloading video urls')
@@ -141,7 +142,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None:
- title_el = itemdoc.find('.//title')
+ title_el = itemdoc.find('.//title') or itemdoc.find('./title')
if title_el.text is None:
title_el = None
@@ -174,8 +175,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
if self._LANG:
info_url += 'lang=%s&' % self._LANG
info_url += data
+ return self._get_videos_info_from_url(info_url, video_id)
+
+ def _get_videos_info_from_url(self, url, video_id):
idoc = self._download_xml(
- info_url, video_id,
+ url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
return self.playlist_result(
[self._get_video_info(item) for item in idoc.findall('.//item')])
@@ -288,3 +292,65 @@ class MTVIggyIE(MTVServicesInfoExtractor):
}
}
_FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
+
+
+class MTVDEIE(MTVServicesInfoExtractor):
+ IE_NAME = 'mtv.de'
+ _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
+ _TESTS = [{
+ 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
+ 'info_dict': {
+ 'id': 'music_video-a50bc5f0b3aa4b3190aa',
+ 'ext': 'mp4',
+ 'title': 'MusicVideo_cro-traum',
+ 'description': 'Cro - Traum',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
+ 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
+ 'info_dict': {
+ 'id': 'local_playlist-f5ae778b9832cc837189',
+ 'ext': 'mp4',
+ 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # single video in pagePlaylist with different id
+ 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
+ 'info_dict': {
+ 'id': 'local_playlist-4e760566473c4c8c5344',
+ 'ext': 'mp4',
+ 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1',
+ 'description': 'MTV Movies Supercut',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'),
+ video_id)
+
+ # news pages contain single video in playlist with different id
+ if len(playlist) == 1:
+ return self._get_videos_info_from_url(playlist[0]['mrss'], video_id)
+
+ for item in playlist:
+ item_id = item.get('id')
+ if item_id and compat_str(item_id) == video_id:
+ return self._get_videos_info_from_url(item['mrss'], video_id)
diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py
new file mode 100644
index 000000000..66b523197
--- /dev/null
+++ b/youtube_dl/extractor/mwave.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
+
+
class MwaveIE(InfoExtractor):
    """Extractor for clip pages under mwave.interest.me/mnettv/."""

    _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859',
        'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc',
        'info_dict': {
            'id': '168859',
            'ext': 'flv',
            'title': '[M COUNTDOWN] SISTAR - SHAKE IT',
            'thumbnail': 're:^https?://.*\.jpg$',
            'uploader': 'M COUNTDOWN',
            'duration': 206,
            'view_count': int,
        }
    }

    def _real_extract(self, url):
        """Fetch the clip's VOD description and collect f4m formats per CDN.

        Each entry of the ``cdn`` list points at a JSON document whose
        ``fileurl`` is an f4m manifest; entries lacking either URL are
        skipped.
        """
        video_id = self._match_id(url)

        vod_info = self._download_json(
            'http://mwave.interest.me/onair/vod_info.m?vodtype=CL&sectorid=&endinfo=Y&id=%s' % video_id,
            video_id, 'Download vod JSON')

        formats = []
        for index, cdn in enumerate(vod_info['cdn']):
            cdn_url = cdn.get('url')
            if cdn_url:
                # Fall back to the positional index when the CDN is unnamed.
                label = cdn.get('name') or compat_str(index)
                manifest_info = self._download_json(
                    cdn_url, video_id,
                    'Download %s stream JSON' % label)
                manifest_url = manifest_info.get('fileurl')
                if manifest_url:
                    manifest_url = manifest_url + '&hdcore=3.0.3'
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, f4m_id=label))
        self._sort_formats(formats)

        result = {'id': video_id}
        result['title'] = vod_info['title']
        result['thumbnail'] = vod_info.get('cover')
        result['uploader'] = vod_info.get('program_title')
        result['duration'] = parse_duration(vod_info.get('time'))
        result['view_count'] = int_or_none(vod_info.get('hit'))
        result['formats'] = formats
        return result
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 5b9b9fbcd..4557a2b13 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -35,7 +35,8 @@ class MySpassIE(InfoExtractor):
# get metadata
metadata_url = META_DATA_URL_TEMPLATE % video_id
- metadata = self._download_xml(metadata_url, video_id)
+ metadata = self._download_xml(
+ metadata_url, video_id, transform_source=lambda s: s.strip())
# extract values from metadata
url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py
new file mode 100644
index 000000000..4c65be122
--- /dev/null
+++ b/youtube_dl/extractor/myvi.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .vimple import SprutoBaseIE
+
+
class MyviIE(SprutoBaseIE):
    """Extractor for myvi.ru / myvi.tv player URLs (embed, flash, API, preloader)."""

    _VALID_URL = r'''(?x)
                    https?://
                        myvi\.(?:ru/player|tv)/
                            (?:
                                (?:
                                    embed/html|
                                    flash|
                                    api/Video/Get
                                )/|
                                content/preloader\.swf\?.*\bid=
                            )
                            (?P<id>[\da-zA-Z_-]+)
                    '''
    _TESTS = [{
        'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
        'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
        'info_dict': {
            'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
            'ext': 'mp4',
            'title': 'хозяин жизни',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 25,
        },
    }, {
        'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        """Return the ``src`` of the first myvi player iframe in *webpage*.

        Returns None when no matching iframe is found.
        """
        # The URL body must stop at whichever quote opened the attribute
        # (group 1).  Excluding only '"' would let a single-quoted src run
        # on into the following attributes.
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/(?:(?!\1).)+)\1', webpage)
        if mobj:
            return mobj.group('url')

    def _real_extract(self, url):
        """Download the player API JSON and delegate to the Spruto extractor."""
        video_id = self._match_id(url)

        spruto = self._download_json(
            'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData']

        return self._extract_spruto(spruto, video_id)
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 5e754fcff..c96f472a3 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -10,6 +10,7 @@ from .common import InfoExtractor
from ..compat import (
compat_ord,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_request,
)
from ..utils import (
@@ -107,7 +108,7 @@ class MyVideoIE(InfoExtractor):
if not a == '_encxml':
params[a] = b
else:
- encxml = compat_urllib_parse.unquote(b)
+ encxml = compat_urllib_parse_unquote(b)
if not params.get('domain'):
params['domain'] = 'www.myvideo.de'
xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
@@ -135,7 +136,7 @@ class MyVideoIE(InfoExtractor):
video_url = None
mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
if mobj:
- video_url = compat_urllib_parse.unquote(mobj.group(1))
+ video_url = compat_urllib_parse_unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
self.report_warning(
'Rewriting URL to use unencrypted rtmp:// ...',
@@ -147,10 +148,10 @@ class MyVideoIE(InfoExtractor):
mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
if mobj is None:
raise ExtractorError('unable to extract url')
- video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+ video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2))
video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
- video_file = compat_urllib_parse.unquote(video_file)
+ video_file = compat_urllib_parse_unquote(video_file)
if not video_file.endswith('f4m'):
ppath, prefix = video_file.split('.')
@@ -159,7 +160,7 @@ class MyVideoIE(InfoExtractor):
video_playpath = ''
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
- video_swfobj = compat_urllib_parse.unquote(video_swfobj)
+ video_swfobj = compat_urllib_parse_unquote(video_swfobj)
video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
webpage, 'title')
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
index c18640c5a..6fc9e7b05 100644
--- a/youtube_dl/extractor/nationalgeographic.py
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -8,25 +8,40 @@ from ..utils import (
class NationalGeographicIE(InfoExtractor):
- _VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?'
-
- _TEST = {
- 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
- 'info_dict': {
- 'id': '4DmDACA6Qtk_',
- 'ext': 'flv',
- 'title': 'Mating Crabs Busted by Sharks',
- 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+ _VALID_URL = r'http://video\.nationalgeographic\.com/.*?'
+
+ _TESTS = [
+ {
+ 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+ 'info_dict': {
+ 'id': '4DmDACA6Qtk_',
+ 'ext': 'flv',
+ 'title': 'Mating Crabs Busted by Sharks',
+ 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+ },
+ 'add_ie': ['ThePlatform'],
},
- 'add_ie': ['ThePlatform'],
- }
+ {
+ 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws',
+ 'info_dict': {
+ 'id': '_JeBD_D7PlS5',
+ 'ext': 'flv',
+ 'title': 'The Real Jaws',
+ 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ ]
def _real_extract(self, url):
name = url_basename(url)
webpage = self._download_webpage(url, name)
- feed_url = self._search_regex(r'data-feed-url="([^"]+)"', webpage, 'feed url')
- guid = self._search_regex(r'data-video-guid="([^"]+)"', webpage, 'guid')
+ feed_url = self._search_regex(
+ r'data-feed-url="([^"]+)"', webpage, 'feed url')
+ guid = self._search_regex(
+ r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
+ webpage, 'guid')
feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
content = feed.find('.//{http://search.yahoo.com/mrss/}content')
@@ -34,5 +49,6 @@ class NationalGeographicIE(InfoExtractor):
return self.url_result(smuggle_url(
'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
- # For some reason, the normal links don't work and we must force the use of f4m
+ # For some reason, the normal links don't work and we must force
+ # the use of f4m
{'force_smil_url': True}))
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index c10405f04..925967753 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
class NaverIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://tvcast.naver.com/v/81652',
'info_dict': {
'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
'upload_date': '20130903',
},
- }
+ }, {
+ 'url': 'http://tvcast.naver.com/v/395837',
+ 'md5': '638ed4c12012c458fefcddfd01f173cd',
+ 'info_dict': {
+ 'id': '395837',
+ 'ext': 'mp4',
+ 'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+ 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+ 'upload_date': '20150519',
+ },
+ 'skip': 'Georestricted',
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
webpage)
if m_id is None:
m_error = re.search(
- r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
webpage)
if m_error:
raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
domain = format_el.find('Domain').text
+ uri = format_el.find('uri').text
f = {
- 'url': domain + format_el.find('uri').text,
+ 'url': compat_urlparse.urljoin(domain, uri),
'ext': 'mp4',
'width': int(format_el.find('width').text),
'height': int(format_el.find('height').text),
}
if domain.startswith('rtmp'):
+ # urlparse does not support custom schemes
+ # https://bugs.python.org/issue18828
f.update({
+ 'url': domain + uri,
'ext': 'flv',
'rtmp_protocol': '1', # rtmpt
})
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 862b706bf..944096e1c 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
}, {
'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
'only_matching': True,
+ }, {
+ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+ 'info_dict': {
+ 'id': '0041400301-cle-atl-recap.nba',
+ 'ext': 'mp4',
+ 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+ 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+ 'duration': 228,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
}]
def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
description = self._og_search_description(webpage)
- duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration'))
+ duration_str = self._html_search_meta(
+ 'duration', webpage, 'duration', default=None)
+ if not duration_str:
+ duration_str = self._html_search_regex(
+ r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+ duration = parse_duration(duration_str)
return {
'id': shortened_video_id,
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index ecd0ac8b1..e683d24c4 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -10,6 +10,8 @@ from ..compat import (
from ..utils import (
ExtractorError,
find_xpath_attr,
+ lowercase_escape,
+ unescapeHTML,
)
@@ -37,14 +39,32 @@ class NBCIE(InfoExtractor):
},
'skip': 'Only works from US',
},
+ {
+ 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
+ 'info_dict': {
+ 'id': '8iUuyzWDdYUZ',
+ 'ext': 'flv',
+ 'title': 'Star Wars Teaser',
+ 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+ },
+ 'skip': 'Only works from US',
+ },
+ {
+ # This video has expired but with an escaped embedURL
+ 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
+ 'skip': 'Expired'
+ }
]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._search_regex(
- '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
- webpage, 'theplatform url').replace('_no_endcard', '')
+ theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
+ [
+ r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+ r'"embedURL"\s*:\s*"([^"]+)"'
+ ],
+ webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return self.url_result(theplatform_url)
@@ -104,7 +124,7 @@ class NBCSportsIE(InfoExtractor):
class NBCNewsIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
(?:video/.+?/(?P<id>\d+)|
- (?:feature|nightly-news)/[^/]+/(?P<title>.+))
+ (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+))
'''
_TESTS = [
@@ -149,6 +169,10 @@ class NBCNewsIE(InfoExtractor):
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
},
},
+ {
+ 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -212,3 +236,28 @@ class NBCNewsIE(InfoExtractor):
'url': info['videoAssets'][-1]['publicUrl'],
'ie_key': 'ThePlatform',
}
+
+
class MSNBCIE(InfoExtractor):
    """Extractor for msnbc.com watch pages; hands the embed URL to another extractor."""

    # https URLs redirect to corresponding http ones
    _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
        'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
        'info_dict': {
            'id': 'n_hayes_Aimm_140801_272214',
            'ext': 'mp4',
            'title': 'The chaotic GOP immigration vote',
            'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
            'thumbnail': 're:^https?://.*\.jpg$',
            'timestamp': 1406937606,
            'upload_date': '20140802',
            'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
        },
    }

    def _real_extract(self, url):
        """Read the page's ``embedURL`` meta value and return it as a URL result."""
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)
        return self.url_result(self._html_search_meta('embedURL', page))
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index f49c66690..79a13958b 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -8,41 +8,11 @@ from ..utils import (
ExtractorError,
int_or_none,
qualities,
+ parse_duration,
)
-class NDRIE(InfoExtractor):
- IE_NAME = 'ndr'
- IE_DESC = 'NDR.de - Mediathek'
- _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
-
- _TESTS = [
- {
- 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
- 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
- 'note': 'Video file',
- 'info_dict': {
- 'id': '25866',
- 'ext': 'mp4',
- 'title': 'Kartoffeltage in der Lewitz',
- 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
- 'duration': 166,
- }
- },
- {
- 'url': 'http://www.ndr.de/info/audio51535.html',
- 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
- 'note': 'Audio file',
- 'info_dict': {
- 'id': '51535',
- 'ext': 'mp3',
- 'title': 'La Valette entgeht der Hinrichtung',
- 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
- 'duration': 884,
- }
- }
- ]
-
+class NDRBaseIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
@@ -54,7 +24,11 @@ class NDRIE(InfoExtractor):
if description:
description = description.strip()
- duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
+ duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None))
+ if not duration:
+ duration = parse_duration(self._html_search_regex(
+ r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)',
+ page, 'duration', default=None))
formats = []
@@ -92,3 +66,65 @@ class NDRIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
+
+
class NDRIE(NDRBaseIE):
    """NDR.de Mediathek pages; defines only URL pattern and tests on top of NDRBaseIE."""

    IE_NAME = 'ndr'
    IE_DESC = 'NDR.de - Mediathek'
    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'

    _TESTS = [{
        'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
        'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
        'note': 'Video file',
        'info_dict': {
            'id': '25866',
            'ext': 'mp4',
            'title': 'Kartoffeltage in der Lewitz',
            'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
            'duration': 166,
        },
        'skip': '404 Not found',
    }, {
        'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
        'md5': 'dadc003c55ae12a5d2f6bd436cd73f59',
        'info_dict': {
            'id': '988',
            'ext': 'mp4',
            'title': 'Party, Pötte und Parade',
            'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.',
            'duration': 3498,
        },
    }, {
        'url': 'http://www.ndr.de/info/audio51535.html',
        'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
        'note': 'Audio file',
        'info_dict': {
            'id': '51535',
            'ext': 'mp3',
            'title': 'La Valette entgeht der Hinrichtung',
            'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
            'duration': 884,
        }
    }]
+
+
class NJoyIE(NDRBaseIE):
    """n-joy.de pages; defines only URL pattern and test on top of NDRBaseIE."""

    IE_NAME = 'N-JOY'
    _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html'

    _TEST = {
        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
        'md5': 'cb63be60cd6f9dd75218803146d8dc67',
        'info_dict': {
            'id': '2480',
            'ext': 'mp4',
            'title': 'Benaissa beim NDR Comedy Contest',
            'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.',
            'duration': 654,
        },
    }
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
new file mode 100644
index 000000000..a8e0a64ed
--- /dev/null
+++ b/youtube_dl/extractor/neteasemusic.py
@@ -0,0 +1,459 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from hashlib import md5
+from base64 import b64encode
+from datetime import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+ compat_urllib_parse,
+ compat_str,
+ compat_itertools_count,
+)
+
+
class NetEaseMusicBaseIE(InfoExtractor):
    """Shared helpers for the music.163.com (NetEase Cloud Music) extractors."""

    # Keys of a song-info dict that may hold per-quality file details.
    _FORMATS = ['bMusic', 'mMusic', 'hMusic']
    _NETEASE_SALT = '3go8&$8*3*3h0k(2)2'
    _API_BASE = 'http://music.163.com/api/'

    @classmethod
    def _encrypt(cls, dfsid):
        """Derive the URL path token for a file id.

        The ASCII form of *dfsid* is XOR-ed byte by byte with the (cycled)
        salt, MD5-hashed, base64-encoded, and finally '/' and '+' are
        replaced by '_' and '-'.
        """
        salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
        string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
        salt_len = len(salt_bytes)
        for i, byte in enumerate(string_bytes):
            string_bytes[i] = byte ^ salt_bytes[i % salt_len]
        m = md5()
        m.update(bytes(string_bytes))
        result = b64encode(m.digest()).decode('ascii')
        return result.replace('/', '_').replace('+', '-')

    @classmethod
    def extract_formats(cls, info):
        """Build a formats list from the quality entries present in *info*.

        Quality keys that are missing or empty are skipped.
        """
        formats = []
        for song_format in cls._FORMATS:
            details = info.get(song_format)
            if not details:
                continue
            formats.append({
                'url': 'http://m1.music.126.net/%s/%s.%s' %
                       (cls._encrypt(details['dfsId']), details['dfsId'],
                        details['extension']),
                'ext': details.get('extension'),
                # `or 0` also covers a bitrate that is present but null,
                # which would otherwise raise TypeError on division.
                'abr': (details.get('bitrate') or 0) / 1000,
                'format_id': song_format,
                'filesize': details.get('size'),
                'asr': details.get('sr')
            })
        return formats

    @classmethod
    def convert_milliseconds(cls, ms):
        """Convert milliseconds to whole seconds (rounded).

        Returns None when *ms* is None, so callers can pass an optional
        field such as a missing ``publishTime`` without crashing.
        """
        if ms is None:
            return None
        return int(round(ms / 1000.0))

    def query_api(self, endpoint, video_id, note):
        """Request ``_API_BASE + endpoint`` with a Referer header and return the parsed JSON."""
        req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint))
        req.add_header('Referer', self._API_BASE)
        return self._download_json(req, video_id, note)
+
+
class NetEaseMusicIE(NetEaseMusicBaseIE):
    """Extractor for a single song on music.163.com."""

    IE_NAME = 'netease:song'
    IE_DESC = '网易云音乐'
    _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://music.163.com/#/song?id=32102397',
        'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
        'info_dict': {
            'id': '32102397',
            'ext': 'mp3',
            'title': 'Bad Blood (feat. Kendrick Lamar)',
            'creator': 'Taylor Swift / Kendrick Lamar',
            'upload_date': '20150517',
            'timestamp': 1431878400,
            'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
        },
    }, {
        'note': 'No lyrics translation.',
        'url': 'http://music.163.com/#/song?id=29822014',
        'info_dict': {
            'id': '29822014',
            'ext': 'mp3',
            'title': '听见下雨的声音',
            'creator': '周杰伦',
            'upload_date': '20141225',
            'timestamp': 1419523200,
            'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
        },
    }, {
        'note': 'No lyrics.',
        'url': 'http://music.163.com/song?id=17241424',
        'info_dict': {
            'id': '17241424',
            'ext': 'mp3',
            'title': 'Opus 28',
            'creator': 'Dustin O\'Halloran',
            'upload_date': '20080211',
            'timestamp': 1202745600,
        },
    }, {
        'note': 'Has translated name.',
        'url': 'http://music.163.com/#/song?id=22735043',
        'info_dict': {
            'id': '22735043',
            'ext': 'mp3',
            'title': '소원을 말해봐 (Genie)',
            'creator': '少女时代',
            'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
            'upload_date': '20100127',
            'timestamp': 1264608000,
            'alt_title': '说出愿望吧(Genie)',
        }
    }]

    def _process_lyrics(self, lyrics_info):
        """Return the song lyrics, merged with their translation if one exists.

        Without a translation the original lyric text (possibly None) is
        returned unchanged.  Otherwise every timestamped original line is
        emitted as ``<timestamp><text> / <translation>``, pairing lines by
        identical timestamp.
        """
        # `or {}` tolerates 'lrc'/'tlyric' keys that are present but null.
        original = (lyrics_info.get('lrc') or {}).get('lyric')
        translated = (lyrics_info.get('tlyric') or {}).get('lyric')

        # Nothing to merge if either side is missing; this also keeps
        # re.findall from being called on None.
        if not original or not translated:
            return original

        lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
        original_ts_texts = re.findall(lyrics_expr, original)
        translation_ts_dict = dict(re.findall(lyrics_expr, translated))
        return '\n'.join(
            '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, ''))
            for time_stamp, text in original_ts_texts)

    def _real_extract(self, url):
        """Query song details and lyrics and assemble the info dict."""
        song_id = self._match_id(url)

        params = {
            'id': song_id,
            'ids': '[%s]' % song_id
        }
        info = self.query_api(
            'song/detail?' + compat_urllib_parse.urlencode(params),
            song_id, 'Downloading song info')['songs'][0]

        formats = self.extract_formats(info)
        self._sort_formats(formats)

        lyrics_info = self.query_api(
            'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
            song_id, 'Downloading lyrics data')
        lyrics = self._process_lyrics(lyrics_info)

        alt_title = None
        if info.get('transNames'):
            alt_title = '/'.join(info.get('transNames'))

        album = info.get('album') or {}
        # publishTime is optional; only convert it when it is present so a
        # missing value yields no timestamp instead of a TypeError.
        publish_time = album.get('publishTime')
        timestamp = None
        if publish_time is not None:
            timestamp = self.convert_milliseconds(publish_time)

        return {
            'id': song_id,
            'title': info['name'],
            'alt_title': alt_title,
            'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
            'timestamp': timestamp,
            'thumbnail': album.get('picUrl'),
            'duration': self.convert_milliseconds(info.get('duration', 0)),
            'description': lyrics,
            'formats': formats,
        }
+
+
class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
    """Extractor turning a music.163.com album into a playlist of its songs."""

    IE_NAME = 'netease:album'
    IE_DESC = '网易云音乐 - 专辑'
    _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://music.163.com/#/album?id=220780',
        'info_dict': {
            'id': '220780',
            'title': 'B\'day',
        },
        'playlist_count': 23,
    }

    def _real_extract(self, url):
        """Fetch album data and return one song URL result per track."""
        album_id = self._match_id(url)

        album = self.query_api(
            'album/%s?id=%s' % (album_id, album_id),
            album_id, 'Downloading album data')['album']

        title = album['name']
        description = album.get('description')

        entries = []
        for track in album['songs']:
            entries.append(self.url_result(
                'http://music.163.com/#/song?id=%s' % track['id'],
                'NetEaseMusic', track['id']))

        return self.playlist_result(entries, album_id, title, description)
+
+
class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
    """Extractor turning a music.163.com artist page into a playlist of hot songs."""

    IE_NAME = 'netease:singer'
    IE_DESC = '网易云音乐 - 歌手'
    _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)'
    _TESTS = [{
        'note': 'Singer has aliases.',
        'url': 'http://music.163.com/#/artist?id=10559',
        'info_dict': {
            'id': '10559',
            'title': '张惠妹 - aMEI;阿密特',
        },
        'playlist_count': 50,
    }, {
        'note': 'Singer has translated name.',
        'url': 'http://music.163.com/#/artist?id=124098',
        'info_dict': {
            'id': '124098',
            'title': '李昇基 - 이승기',
        },
        'playlist_count': 50,
    }]

    def _real_extract(self, url):
        """Fetch artist data; the playlist title appends translation and aliases."""
        singer_id = self._match_id(url)

        info = self.query_api(
            'artist/%s?id=%s' % (singer_id, singer_id),
            singer_id, 'Downloading singer data')

        artist = info['artist']
        title = artist['name']
        translated_name = artist['trans']
        if translated_name:
            title = '%s - %s' % (title, translated_name)
        aliases = artist['alias']
        if aliases:
            title = '%s - %s' % (title, ';'.join(aliases))

        entries = []
        for track in info['hotSongs']:
            entries.append(self.url_result(
                'http://music.163.com/#/song?id=%s' % track['id'],
                'NetEaseMusic', track['id']))

        return self.playlist_result(entries, singer_id, title)
+
+
class NetEaseMusicListIE(NetEaseMusicBaseIE):
    """Extractor for music.163.com playlists and toplists (charts)."""

    IE_NAME = 'netease:playlist'
    IE_DESC = '网易云音乐 - 歌单'
    _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://music.163.com/#/playlist?id=79177352',
        'info_dict': {
            'id': '79177352',
            'title': 'Billboard 2007 Top 100',
            'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
        },
        'playlist_count': 99,
    }, {
        'note': 'Toplist/Charts sample',
        'url': 'http://music.163.com/#/discover/toplist?id=3733003',
        'info_dict': {
            'id': '3733003',
            'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}',
            'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
        },
        'playlist_count': 50,
    }]

    def _real_extract(self, url):
        """Fetch playlist details and return one song URL result per track.

        For charts (``specialType`` 10) the last-update date is appended
        to the playlist title.
        """
        list_id = self._match_id(url)

        info = self.query_api(
            'playlist/detail?id=%s&lv=-1&tv=-1' % list_id,
            list_id, 'Downloading playlist data')['result']

        title = info['name']
        description = info.get('description')

        if info.get('specialType') == 10:  # is a chart/toplist
            updated = datetime.fromtimestamp(
                self.convert_milliseconds(info['updateTime']))
            title = '%s %s' % (title, updated.strftime('%Y-%m-%d'))

        entries = []
        for track in info['tracks']:
            entries.append(self.url_result(
                'http://music.163.com/#/song?id=%s' % track['id'],
                'NetEaseMusic', track['id']))

        return self.playlist_result(entries, list_id, title, description)
+
+
class NetEaseMusicMvIE(NetEaseMusicBaseIE):
    """Extractor for music videos (MV) on music.163.com."""

    IE_NAME = 'netease:mv'
    IE_DESC = '网易云音乐 - MV'
    _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://music.163.com/#/mv?id=415350',
        'info_dict': {
            'id': '415350',
            'ext': 'mp4',
            'title': '이럴거면 그러지말지',
            'description': '白雅言自作曲唱甜蜜爱情',
            'creator': '白雅言',
            'upload_date': '20150520',
        },
    }

    def _real_extract(self, url):
        """Fetch MV details; each key of ``brs`` is used as the format height."""
        mv_id = self._match_id(url)

        info = self.query_api(
            'mv/detail?id=%s&type=mp4' % mv_id,
            mv_id, 'Downloading mv info')['data']

        formats = []
        for resolution, stream_url in info['brs'].items():
            formats.append({
                'url': stream_url,
                'ext': 'mp4',
                'format_id': '%sp' % resolution,
                'height': int(resolution),
            })
        self._sort_formats(formats)

        return {
            'id': mv_id,
            'title': info['name'],
            'description': info.get('desc') or info.get('briefDesc'),
            'creator': info['artistName'],
            # publishTime is turned into upload_date by dropping the dashes.
            'upload_date': info['publishTime'].replace('-', ''),
            'formats': formats,
            'thumbnail': info.get('cover'),
            'duration': self.convert_milliseconds(info.get('duration', 0)),
        }
+
+
class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
    """Extractor for radio programs on music.163.com.

    A program with accompanying songs is returned as a playlist (main
    audio first) unless ``noplaylist`` is set; otherwise only the main
    audio is extracted.
    """

    IE_NAME = 'netease:program'
    IE_DESC = '网易云音乐 - 电台节目'
    # The '#/' fragment prefix is optional, as in the sibling extractors.
    # The previous '(#/?)' made the '#' mandatory and rejected plain
    # 'music.163.com/program?id=...' URLs.
    _VALID_URL = r'https?://music\.163\.com/(#/?)?program\?id=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://music.163.com/#/program?id=10109055',
        'info_dict': {
            'id': '10109055',
            'ext': 'mp3',
            'title': '不丹足球背后的故事',
            'description': '喜马拉雅人的足球梦 ...',
            'creator': '大话西藏',
            'timestamp': 1434179342,
            'upload_date': '20150613',
            'duration': 900,
        },
    }, {
        'note': 'This program has accompanying songs.',
        'url': 'http://music.163.com/#/program?id=10141022',
        'info_dict': {
            'id': '10141022',
            'title': '25岁,你是自在如风的少年<27°C>',
            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
        },
        'playlist_count': 4,
    }, {
        'note': 'This program has accompanying songs.',
        'url': 'http://music.163.com/#/program?id=10141022',
        'info_dict': {
            'id': '10141022',
            'ext': 'mp3',
            'title': '25岁,你是自在如风的少年<27°C>',
            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
            'timestamp': 1434450841,
            'upload_date': '20150616',
        },
        'params': {
            'noplaylist': True
        }
    }]

    def _real_extract(self, url):
        """Return the main audio, or a playlist of main audio plus songs."""
        program_id = self._match_id(url)

        info = self.query_api(
            'dj/program/detail?id=%s' % program_id,
            program_id, 'Downloading program info')['program']

        name = info['name']
        description = info['description']

        if not info['songs'] or self._downloader.params.get('noplaylist'):
            # Only tell the user about --no-playlist when there actually
            # were extra songs that are being skipped.
            if info['songs']:
                self.to_screen(
                    'Downloading just the main audio %s because of --no-playlist'
                    % info['mainSong']['id'])

            formats = self.extract_formats(info['mainSong'])
            self._sort_formats(formats)

            return {
                'id': program_id,
                'title': name,
                'description': description,
                'creator': info['dj']['brand'],
                'timestamp': self.convert_milliseconds(info['createTime']),
                'thumbnail': info['coverUrl'],
                'duration': self.convert_milliseconds(info.get('duration', 0)),
                'formats': formats,
            }

        self.to_screen(
            'Downloading playlist %s - add --no-playlist to just download the main audio %s'
            % (program_id, info['mainSong']['id']))

        song_ids = [info['mainSong']['id']]
        song_ids.extend(song['id'] for song in info['songs'])
        entries = [
            self.url_result('http://music.163.com/#/song?id=%s' % song_id,
                            'NetEaseMusic', song_id)
            for song_id in song_ids
        ]
        return self.playlist_result(entries, program_id, name, description)
+
+
class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
    """Extractor turning a music.163.com DJ radio into a playlist of its programs."""

    IE_NAME = 'netease:djradio'
    IE_DESC = '网易云音乐 - 电台'
    _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://music.163.com/#/djradio?id=42',
        'info_dict': {
            'id': '42',
            'title': '声音蔓延',
            'description': 'md5:766220985cbd16fdd552f64c578a6b15'
        },
        'playlist_mincount': 40,
    }
    # Number of programs requested per API call.
    _PAGE_SIZE = 1000

    def _real_extract(self, url):
        """Page through the radio's programs until the API reports no more.

        The playlist title and description are taken from the ``radio``
        object of the first program seen; they stay None when the radio
        has no programs at all.
        """
        dj_id = self._match_id(url)

        name = None
        desc = None
        entries = []
        for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE):
            info = self.query_api(
                'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
                % (self._PAGE_SIZE, dj_id, offset),
                dj_id, 'Downloading dj programs - %d' % offset)

            programs = info['programs']
            entries.extend([
                self.url_result(
                    'http://music.163.com/#/program?id=%s' % program['id'],
                    'NetEaseMusicProgram', program['id'])
                for program in programs
            ])

            # Guard against an empty page: indexing programs[0] on a radio
            # without programs would raise IndexError.
            if name is None and programs:
                radio = programs[0]['radio']
                name = radio['name']
                desc = radio['desc']

            if not info['more']:
                break

        return self.playlist_result(entries, dj_id, name, desc)
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
index bc17e20aa..0d165a82a 100644
--- a/youtube_dl/extractor/netzkino.py
+++ b/youtube_dl/extractor/netzkino.py
@@ -49,7 +49,7 @@ class NetzkinoIE(InfoExtractor):
'http://www.netzkino.de/beta/dist/production.min.js', video_id,
note='Downloading player code')
avo_js = self._search_regex(
- r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+ r'var urlTemplate=(\{.*?"\})',
production_js, 'URL templates')
templates = self._parse_json(
avo_js, video_id, transform_source=js_to_json)
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
index 85fcad06b..5a9e73cd6 100644
--- a/youtube_dl/extractor/newstube.py
+++ b/youtube_dl/extractor/newstube.py
@@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
video_guid = self._html_search_regex(
- r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+ r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
page, 'video GUID')
player = self._download_xml(
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index 02dba4ef6..c10784f6b 100644
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601
class NextMediaIE(InfoExtractor):
+ IE_DESC = '蘋果日報'
_VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
@@ -66,6 +67,7 @@ class NextMediaIE(InfoExtractor):
class NextMediaActionNewsIE(NextMediaIE):
+ IE_DESC = '蘋果日報 - 動新聞'
_VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
_TESTS = [{
'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
@@ -89,8 +91,9 @@ class NextMediaActionNewsIE(NextMediaIE):
return self._extract_from_nextmedia_page(news_id, url, article_page)
-class AppleDailyRealtimeNewsIE(NextMediaIE):
- _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+ IE_DESC = '臺灣蘋果日報'
+ _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
_TESTS = [{
'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +102,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:b23787119933404ce515c6356a8c355c',
+ 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
'upload_date': '20150128',
}
}, {
@@ -110,26 +113,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '不滿被踩腳 山東兩大媽一路打下車',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+ 'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
'upload_date': '20150128',
}
- }]
-
- _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
- def _fetch_title(self, page):
- return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
- def _fetch_thumbnail(self, page):
- return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
- def _fetch_timestamp(self, page):
- return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
- _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
- _TESTS = [{
+ }, {
'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
'md5': '03df296d95dedc2d5886debbb80cb43f',
'info_dict': {
@@ -154,10 +141,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
'expected_warnings': [
'video thumbnail',
]
+ }, {
+ 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+ 'only_matching': True,
}]
+ _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
def _fetch_title(self, page):
- return self._html_search_meta('description', page, 'news title')
+ return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+ self._html_search_meta('description', page, 'news title'))
+
+ def _fetch_thumbnail(self, page):
+ return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+ def _fetch_timestamp(self, page):
+ return None
def _fetch_description(self, page):
return self._html_search_meta('description', page, 'news description')
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 2684dd250..dc54634a5 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -19,7 +19,7 @@ class NFLIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
(?:.+?/)*
- (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+ (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
_TESTS = [
{
'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
@@ -58,6 +58,10 @@ class NFLIE(InfoExtractor):
'upload_date': '20150202',
},
},
+ {
+ 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+ 'only_matching': True,
+ }
]
@staticmethod
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 407465998..279b18386 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -21,6 +21,9 @@ class NHLBaseInfoExtractor(InfoExtractor):
return json_string.replace('\\\'', '\'')
def _real_extract_video(self, video_id):
+ vid_parts = video_id.split(',')
+ if len(vid_parts) == 3:
+ video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0'))
json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
data = self._download_json(
json_url, video_id, transform_source=self._fix_json)
@@ -47,7 +50,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
video_url = initial_video_url
join = compat_urlparse.urljoin
- return {
+ ret = {
'id': video_id,
'title': info['name'],
'url': video_url,
@@ -56,11 +59,20 @@ class NHLBaseInfoExtractor(InfoExtractor):
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
'upload_date': unified_strdate(info['releaseDate'].split('.')[0]),
}
+ if video_url.startswith('rtmp:'):
+ mobj = re.match(r'(?P<tc_url>rtmp://[^/]+/(?P<app>[a-z0-9/]+))/(?P<play_path>mp4:.*)', video_url)
+ ret.update({
+ 'tc_url': mobj.group('tc_url'),
+ 'play_path': mobj.group('play_path'),
+ 'app': mobj.group('app'),
+ 'no_resume': True,
+ })
+ return ret
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)(?:id|hlg)=(?P<id>[-0-9a-zA-Z,]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
@@ -101,6 +113,29 @@ class NHLIE(NHLBaseInfoExtractor):
}, {
'url': 'http://video.nhl.com/videocenter/?id=736722',
'only_matching': True,
+ }, {
+ 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en',
+ 'md5': '076fcb88c255154aacbf0a7accc3f340',
+ 'info_dict': {
+ 'id': '2014020299-X-h',
+ 'ext': 'mp4',
+ 'title': 'Penguins at Islanders / Game Highlights',
+ 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014',
+ 'duration': 268,
+ 'upload_date': '20141122',
+ }
+ }, {
+ 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4',
+ 'info_dict': {
+ 'id': '691469',
+ 'ext': 'mp4',
+ 'title': 'RAW | Craig MacTavish Full Press Conference',
+ 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.',
+ 'upload_date': '20141205',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ }
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 3cecebf95..0f8aa5ada 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -182,7 +182,6 @@ class NiconicoIE(InfoExtractor):
extension = xpath_text(video_info, './/movie_type')
if not extension:
extension = determine_ext(video_real_url)
- video_format = extension.upper()
thumbnail = (
xpath_text(video_info, './/thumbnail_url') or
@@ -241,7 +240,7 @@ class NiconicoIE(InfoExtractor):
'url': video_real_url,
'title': title,
'ext': extension,
- 'format': video_format,
+ 'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
'thumbnail': thumbnail,
'description': description,
'uploader': uploader,
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 251e6da07..a53e27b27 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -14,7 +14,9 @@ from ..compat import (
from ..utils import (
clean_html,
ExtractorError,
- unified_strdate,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
)
@@ -25,21 +27,38 @@ class NocoIE(InfoExtractor):
_SUB_LANG_TEMPLATE = '&sub_lang=%s'
_NETRC_MACHINE = 'noco'
- _TEST = {
- 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
- 'md5': '0a993f0058ddbcd902630b2047ef710e',
- 'info_dict': {
- 'id': '11538',
- 'ext': 'mp4',
- 'title': 'Ami Ami Idol - Hello! France',
- 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
- 'upload_date': '20140412',
- 'uploader': 'Nolife',
- 'uploader_id': 'NOL',
- 'duration': 2851.2,
+ _TESTS = [
+ {
+ 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+ 'md5': '0a993f0058ddbcd902630b2047ef710e',
+ 'info_dict': {
+ 'id': '11538',
+ 'ext': 'mp4',
+ 'title': 'Ami Ami Idol - Hello! France',
+ 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+ 'upload_date': '20140412',
+ 'uploader': 'Nolife',
+ 'uploader_id': 'NOL',
+ 'duration': 2851.2,
+ },
+ 'skip': 'Requires noco account',
},
- 'skip': 'Requires noco account',
- }
+ {
+ 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call',
+ 'md5': 'c190f1f48e313c55838f1f412225934d',
+ 'info_dict': {
+ 'id': '12610',
+ 'ext': 'mp4',
+ 'title': 'The Guild #1 - Wake-Up Call',
+ 'timestamp': 1403863200,
+ 'upload_date': '20140627',
+ 'uploader': 'LBL42',
+ 'uploader_id': 'LBL',
+ 'duration': 233.023,
+ },
+ 'skip': 'Requires noco account',
+ }
+ ]
def _real_initialize(self):
self._login()
@@ -90,51 +109,70 @@ class NocoIE(InfoExtractor):
'shows/%s/medias' % video_id,
video_id, 'Downloading video JSON')
+ show = self._call_api(
+ 'shows/by_id/%s' % video_id,
+ video_id, 'Downloading show JSON')[0]
+
+ options = self._call_api(
+ 'users/init', video_id,
+ 'Downloading user options JSON')['options']
+ audio_lang_pref = options.get('audio_language') or options.get('language', 'fr')
+
+ if audio_lang_pref == 'original':
+ audio_lang_pref = show['original_lang']
+ if len(medias) == 1:
+ audio_lang_pref = list(medias.keys())[0]
+ elif audio_lang_pref not in medias:
+ audio_lang_pref = 'fr'
+
qualities = self._call_api(
'qualities',
video_id, 'Downloading qualities JSON')
formats = []
- for lang, lang_dict in medias['fr']['video_list'].items():
- for format_id, fmt in lang_dict['quality_list'].items():
- format_id_extended = '%s-%s' % (lang, format_id) if lang != 'none' else format_id
-
- video = self._call_api(
- 'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
- video_id, 'Downloading %s video JSON' % format_id_extended,
- lang if lang != 'none' else None)
-
- file_url = video['file']
- if not file_url:
- continue
-
- if file_url in ['forbidden', 'not found']:
- popmessage = video['popmessage']
- self._raise_error(popmessage['title'], popmessage['message'])
-
- formats.append({
- 'url': file_url,
- 'format_id': format_id_extended,
- 'width': fmt['res_width'],
- 'height': fmt['res_lines'],
- 'abr': fmt['audiobitrate'],
- 'vbr': fmt['videobitrate'],
- 'filesize': fmt['filesize'],
- 'format_note': qualities[format_id]['quality_name'],
- 'preference': qualities[format_id]['priority'],
- })
+ for audio_lang, audio_lang_dict in medias.items():
+ preference = 1 if audio_lang == audio_lang_pref else 0
+ for sub_lang, lang_dict in audio_lang_dict['video_list'].items():
+ for format_id, fmt in lang_dict['quality_list'].items():
+ format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id)
+
+ video = self._call_api(
+ 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang),
+ video_id, 'Downloading %s video JSON' % format_id_extended,
+ sub_lang if sub_lang != 'none' else None)
+
+ file_url = video['file']
+ if not file_url:
+ continue
+
+ if file_url in ['forbidden', 'not found']:
+ popmessage = video['popmessage']
+ self._raise_error(popmessage['title'], popmessage['message'])
+
+ formats.append({
+ 'url': file_url,
+ 'format_id': format_id_extended,
+ 'width': int_or_none(fmt.get('res_width')),
+ 'height': int_or_none(fmt.get('res_lines')),
+ 'abr': int_or_none(fmt.get('audiobitrate')),
+ 'vbr': int_or_none(fmt.get('videobitrate')),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'format_note': qualities[format_id].get('quality_name'),
+ 'quality': qualities[format_id].get('priority'),
+ 'preference': preference,
+ })
self._sort_formats(formats)
- show = self._call_api(
- 'shows/by_id/%s' % video_id,
- video_id, 'Downloading show JSON')[0]
+ timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+ if timestamp is not None and timestamp < 0:
+ timestamp = None
- upload_date = unified_strdate(show['online_date_start_utc'])
- uploader = show['partner_name']
- uploader_id = show['partner_key']
- duration = show['duration_ms'] / 1000.0
+ uploader = show.get('partner_name')
+ uploader_id = show.get('partner_key')
+ duration = float_or_none(show.get('duration_ms'), 1000)
thumbnails = []
for thumbnail_key, thumbnail_url in show.items():
@@ -157,7 +195,7 @@ class NocoIE(InfoExtractor):
if episode_number:
title += ' #' + compat_str(episode_number)
if episode:
- title += ' - ' + episode
+ title += ' - ' + compat_str(episode)
description = show.get('show_resume') or show.get('family_resume')
@@ -166,7 +204,7 @@ class NocoIE(InfoExtractor):
'title': title,
'description': description,
'thumbnails': thumbnails,
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644
index 000000000..3f9c776ef
--- /dev/null
+++ b/youtube_dl/extractor/nova.py
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+    """Extractor for video pages on the *.nova.cz sites listed in IE_DESC."""
+    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+    _VALID_URL = r'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+    _TESTS = [{
+        'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+        'info_dict': {
+            'id': '1608920',
+            'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+            'ext': 'flv',
+            'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+            'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'info_dict': {
+            'id': '1757139',
+            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+            'ext': 'mp4',
+            'title': 'Podzemní nemocnice v pražské Krči',
+            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+        'info_dict': {
+            'id': '1756825',
+            'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+            'ext': 'flv',
+            'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+            'description': 'md5:dc24e50be5908df83348e50d1431295e',  # Make sure this description is clean of html tags
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+        'info_dict': {
+            'id': '1756858',
+            'ext': 'flv',
+            'title': 'Televizní noviny - 30. 5. 2015',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+            'upload_date': '20150530',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'info_dict': {
+            'id': '1753621',
+            'ext': 'mp4',
+            'title': 'Zaklínač 3: Divoký hon',
+            'description': 're:.*Pokud se stejně jako my nemůžete.*',
+            'thumbnail': r're:https?://.*\.jpg(\?.*)?',
+            'upload_date': '20150521',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Resolve the page's media id, fetch the player config and build formats."""
+        mobj = re.match(self._VALID_URL, url)
+        display_id, site = mobj.group('id', 'site')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            [r"(?:media|video_id)\s*:\s*'(\d+)'",
+             r'media=(\d+)',
+             r'id="article_video_(\d+)"',
+             r'id="player_(\d+)"'],
+            webpage, 'video id')
+
+        config_url = self._search_regex(
+            r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+            webpage, 'config url', default=None)
+
+        if not config_url:
+            # No config URL embedded in the page: build one from a site id.
+            DEFAULT_SITE_ID = '23000'
+            SITES = {
+                'tvnoviny': DEFAULT_SITE_ID,
+                'novaplus': DEFAULT_SITE_ID,
+                'vymena': DEFAULT_SITE_ID,
+                'krasna': DEFAULT_SITE_ID,
+                'fanda': '30',
+                'tn': '30',
+                'doma': '30',
+            }
+            site_id = self._search_regex(
+                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+            config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+                          % (site_id, video_id))
+
+        # Keep only the outermost {...}; the jsVar parameter suggests a JS wrapper.
+        config = self._download_json(
+            config_url, display_id, 'Downloading config JSON',
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        mediafile = config['mediafile']
+        video_url = mediafile['src']
+        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+        if m:
+            # NOTE(review): 'player_path' is kept as-is, but NowTVIE passes its SWF
+            # as 'player_url'; confirm which key the RTMP downloader actually reads.
+            formats = [{
+                'url': m.group('url'),
+                'app': m.group('app'),
+                'play_path': m.group('playpath'),
+                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+                'ext': 'flv',
+            }]
+        else:
+            formats = [{'url': video_url}]
+        self._sort_formats(formats)
+
+        title = (mediafile.get('meta') or {}).get('title') or self._og_search_title(webpage)
+        description = clean_html(self._og_search_description(webpage, default=None))
+        thumbnail = config.get('poster')
+
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644
index 000000000..c8257719f
--- /dev/null
+++ b/youtube_dl/extractor/nowtv.py
@@ -0,0 +1,193 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ parse_duration,
+ remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    """Extractor for nowtv.de/.at/.ch player and preview pages, via the nowtv API."""
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'flv',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'flv',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'flv',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'flv',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'flv',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'flv',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Look the episode up in the nowtv API and build its RTMP formats."""
+        display_id = self._match_id(url)
+        # Ids with more than two path components are reduced to first/last before
+        # the API lookup; compare the component count, not the string length.
+        display_id_split = display_id.split('/')
+        if len(display_id_split) > 2:
+            display_id = '/'.join((display_id_split[0], display_id_split[-1]))
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, display_id)
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        formats = []
+        for item in (files or {}).get('items', []):  # empty `files`: iterate nothing, no crash
+            if determine_ext(item['path']) != 'f4v':
+                continue
+            app, play_path = remove_start(item['path'], '/').split('/', 1)
+            formats.append({
+                'url': 'rtmpe://fms.rtl.de',
+                'app': app,
+                'play_path': 'mp4:%s' % play_path,
+                'ext': 'flv',
+                'page_url': url,
+                'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf',
+                'tbr': int_or_none(item.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        f = info.get('format', {})
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 5d8448571..eb12fb810 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -1,5 +1,7 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
fix_xml_ampersands,
@@ -7,7 +9,6 @@ from ..utils import (
qualities,
strip_jsonp,
unified_strdate,
- url_basename,
)
@@ -16,13 +17,42 @@ class NPOBaseIE(InfoExtractor):
token_page = self._download_webpage(
'http://ida.omroep.nl/npoplayer/i.js',
video_id, note='Downloading token')
- return self._search_regex(
+ token = self._search_regex(
r'npoplayer\.token = "(.+?)"', token_page, 'token')
+ # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
+ token_l = list(token)
+ first = second = None
+ for i in range(5, len(token_l) - 4):
+ if token_l[i].isdigit():
+ if first is None:
+ first = i
+ elif second is None:
+ second = i
+ if first is None or second is None:
+ first = 12
+ second = 13
+
+ token_l[first], token_l[second] = token_l[second], token_l[first]
+
+ return ''.join(token_l)
class NPOIE(NPOBaseIE):
- IE_NAME = 'npo.nl'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)'
+ IE_NAME = 'npo'
+ IE_DESC = 'npo.nl and ntr.nl'
+ _VALID_URL = r'''(?x)
+ (?:
+ npo:|
+ https?://
+ (?:www\.)?
+ (?:
+ npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+ ntr\.nl/(?:[^/]+/){2,}|
+ omroepwnl\.nl/video/fragment/[^/]+__
+ )
+ )
+ (?P<id>[^/?#]+)
+ '''
_TESTS = [
{
@@ -42,7 +72,7 @@ class NPOIE(NPOBaseIE):
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
- 'title': 'De Mega Mike & Mega Thomas show',
+ 'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
@@ -54,8 +84,8 @@ class NPOIE(NPOBaseIE):
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
- 'title': 'Tegenlicht',
- 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+ 'title': 'Tegenlicht: De toekomst komt uit Afrika',
+ 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
'duration': 3000,
},
@@ -84,6 +114,30 @@ class NPOIE(NPOBaseIE):
'title': 'Hoe gaat Europa verder na Parijs?',
},
},
+ {
+ 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+ 'md5': '01c6a2841675995da1f0cf776f03a9c3',
+ 'info_dict': {
+ 'id': 'VPWON_1233944',
+ 'ext': 'm4v',
+ 'title': 'Aap, poot, pies',
+ 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+ 'upload_date': '20150508',
+ 'duration': 599,
+ },
+ },
+ {
+ 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+ 'md5': 'd30cd8417b8b9bca1fdff27428860d08',
+ 'info_dict': {
+ 'id': 'POW_00996502',
+ 'ext': 'm4v',
+ 'title': '''"Dit is wel een 'landslide'..."''',
+ 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+ 'upload_date': '20150508',
+ 'duration': 462,
+ },
+ }
]
def _real_extract(self, url):
@@ -92,12 +146,24 @@ class NPOIE(NPOBaseIE):
def _get_info(self, video_id):
metadata = self._download_json(
- 'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+ 'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
# We have to remove the javascript callback
transform_source=strip_jsonp,
)
+ # For some videos actual video id (prid) is different (e.g. for
+ # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
+ # video id is POMS_WNL_853698 but prid is POW_00996502)
+ video_id = metadata.get('prid') or video_id
+
+ # titel is too generic in some cases so utilize aflevering_titel as well
+ # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
+ title = metadata['titel']
+ sub_title = metadata.get('aflevering_titel')
+ if sub_title and sub_title != title:
+ title += ': %s' % sub_title
+
token = self._get_token(video_id)
formats = []
@@ -170,8 +236,8 @@ class NPOIE(NPOBaseIE):
return {
'id': video_id,
- 'title': metadata['titel'],
- 'description': metadata['info'],
+ 'title': title,
+ 'description': metadata.get('info'),
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'upload_date': unified_strdate(metadata.get('gidsdatum')),
'duration': parse_duration(metadata.get('tijdsduur')),
@@ -340,9 +406,9 @@ class NPORadioFragmentIE(InfoExtractor):
}
-class TegenlichtVproIE(NPOIE):
- IE_NAME = 'tegenlicht.vpro.nl'
- _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
+class VPROIE(NPOIE):
+ IE_NAME = 'vpro'
+ _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
_TESTS = [
{
@@ -351,17 +417,72 @@ class TegenlichtVproIE(NPOIE):
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
- 'title': 'Tegenlicht',
- 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+ 'title': 'De toekomst komt uit Afrika',
+ 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
},
},
+ {
+ 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
+ 'info_dict': {
+ 'id': 'sergio-herman',
+ 'title': 'Sergio Herman: Fucking perfect',
+ },
+ 'playlist_count': 2,
+ },
+ {
+ # playlist with youtube embed
+ 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
+ 'info_dict': {
+ 'id': 'education-education',
+ 'title': '2Doc',
+ },
+ 'playlist_count': 2,
+ }
]
def _real_extract(self, url):
- name = url_basename(url)
- webpage = self._download_webpage(url, name)
- urn = self._html_search_meta('mediaurn', webpage)
- info_page = self._download_json(
- 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name)
- return self._get_info(info_page['mid'])
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+ for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
+ ]
+
+ playlist_title = self._search_regex(
+ r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
+ webpage, 'playlist title', default=None) or self._og_search_title(webpage)
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class WNLIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+
+ _TEST = {
+ 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
+ 'info_dict': {
+ 'id': 'vandaag-de-dag-6-mei',
+ 'title': 'Vandaag de Dag 6 mei',
+ },
+ 'playlist_count': 4,
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('npo:%s' % video_id, 'NPO')
+ for video_id, part in re.findall(
+ r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage)
+ ]
+
+ playlist_title = self._html_search_regex(
+ r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
+ webpage, 'playlist title')
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index e91d3a248..d066a96db 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@@ -14,7 +13,7 @@ from ..utils import (
class NRKIE(InfoExtractor):
- _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+ _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
_TESTS = [
{
@@ -77,7 +76,7 @@ class NRKIE(InfoExtractor):
class NRKPlaylistIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -117,11 +116,12 @@ class NRKPlaylistIE(InfoExtractor):
class NRKTVIE(InfoExtractor):
- _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+ IE_DESC = 'NRK TV and NRK Radio'
+ _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
_TESTS = [
{
- 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
'info_dict': {
'id': 'MUHH48000314',
@@ -133,7 +133,7 @@ class NRKTVIE(InfoExtractor):
},
},
{
- 'url': 'http://tv.nrk.no/program/mdfp15000514',
+ 'url': 'https://tv.nrk.no/program/mdfp15000514',
'md5': '383650ece2b25ecec996ad7b5bb2a384',
'info_dict': {
'id': 'mdfp15000514',
@@ -146,7 +146,7 @@ class NRKTVIE(InfoExtractor):
},
{
# single playlist video
- 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
'md5': 'adbd1dbd813edaf532b0a253780719c2',
'info_dict': {
'id': 'MSPO40010515-part2',
@@ -158,7 +158,7 @@ class NRKTVIE(InfoExtractor):
'skip': 'Only works from Norway',
},
{
- 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
'playlist': [
{
'md5': '9480285eff92d64f06e02a5367970a7a',
@@ -189,6 +189,10 @@ class NRKTVIE(InfoExtractor):
'duration': 6947.5199999999995,
},
'skip': 'Only works from Norway',
+ },
+ {
+ 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+ 'only_matching': True,
}
]
@@ -200,24 +204,15 @@ class NRKTVIE(InfoExtractor):
url = "%s%s" % (baseurl, subtitlesurl)
self._debug_print('%s: Subtitle url: %s' % (video_id, url))
captions = self._download_xml(
- url, video_id, 'Downloading subtitles',
- transform_source=lambda s: s.replace(r'<br />', '\r\n'))
+ url, video_id, 'Downloading subtitles')
lang = captions.get('lang', 'no')
- ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
- srt = ''
- for pos, p in enumerate(ps):
- begin = parse_duration(p.get('begin'))
- duration = parse_duration(p.get('dur'))
- starttime = self._subtitles_timecode(begin)
- endtime = self._subtitles_timecode(begin + duration)
- srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text)
return {lang: [
{'ext': 'ttml', 'url': url},
- {'ext': 'srt', 'data': srt},
]}
def _extract_f4m(self, manifest_url, video_id):
- return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+ return self._extract_f4m_formats(
+ manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -279,7 +274,7 @@ class NRKTVIE(InfoExtractor):
m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
self._sort_formats(formats)
subtitles_url = self._html_search_regex(
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 03f0a4de6..7f254b867 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -8,30 +8,8 @@ from ..utils import (
)
-class NYTimesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
-
- _TESTS = [{
- 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
- 'md5': '18a525a510f942ada2720db5f31644c0',
- 'info_dict': {
- 'id': '100000002847155',
- 'ext': 'mov',
- 'title': 'Verbatim: What Is a Photocopier?',
- 'description': 'md5:93603dada88ddbda9395632fdc5da260',
- 'timestamp': 1398631707,
- 'upload_date': '20140427',
- 'uploader': 'Brett Weiner',
- 'duration': 419,
- }
- }, {
- 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
+class NYTimesBaseIE(InfoExtractor):
+ def _extract_video_from_id(self, video_id):
video_data = self._download_json(
'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
video_id, 'Downloading video JSON')
@@ -81,3 +59,59 @@ class NYTimesIE(InfoExtractor):
'formats': formats,
'thumbnails': thumbnails,
}
+
+
+class NYTimesIE(NYTimesBaseIE):
+ _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+ 'md5': '18a525a510f942ada2720db5f31644c0',
+ 'info_dict': {
+ 'id': '100000002847155',
+ 'ext': 'mov',
+ 'title': 'Verbatim: What Is a Photocopier?',
+ 'description': 'md5:93603dada88ddbda9395632fdc5da260',
+ 'timestamp': 1398631707,
+ 'upload_date': '20140427',
+ 'uploader': 'Brett Weiner',
+ 'duration': 419,
+ }
+ }, {
+ 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ return self._extract_video_from_id(video_id)
+
+
+class NYTimesArticleIE(NYTimesBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
+ _TESTS = [{
+ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
+ 'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
+ 'info_dict': {
+ 'id': '100000003628438',
+ 'ext': 'mov',
+ 'title': 'New Minimum Wage: $70,000 a Year',
+ 'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
+ 'timestamp': 1429033037,
+ 'upload_date': '20150414',
+ 'uploader': 'Matthew Williams',
+ }
+ }, {
+ 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id')
+
+ return self._extract_video_from_id(video_id)
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 155d0ee6a..003d27de7 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -2,18 +2,21 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
unified_strdate,
int_or_none,
qualities,
+ unescapeHTML,
)
class OdnoklassnikiIE(InfoExtractor):
- _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
_TESTS = [{
+ # metadata in JSON
'url': 'http://ok.ru/video/20079905452',
- 'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
+ 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc',
'info_dict': {
'id': '20079905452',
'ext': 'mp4',
@@ -26,6 +29,21 @@ class OdnoklassnikiIE(InfoExtractor):
'age_limit': 0,
},
}, {
+ # metadataUrl
+ 'url': 'http://ok.ru/video/63567059965189-0',
+ 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'info_dict': {
+ 'id': '63567059965189-0',
+ 'ext': 'mp4',
+ 'title': 'Девушка без комплексов ...',
+ 'duration': 191,
+ 'upload_date': '20150518',
+ 'uploader_id': '534380003155',
+ 'uploader': '☭ Андрей Мещанинов ☭',
+ 'like_count': int,
+ 'age_limit': 0,
+ },
+ }, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
}]
@@ -33,14 +51,23 @@ class OdnoklassnikiIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ 'http://ok.ru/video/%s' % video_id, video_id)
player = self._parse_json(
- self._search_regex(
- r"OKVideo\.start\(({.+?})\s*,\s*'VideoAutoplay_player'", webpage, 'player'),
+ unescapeHTML(self._search_regex(
+ r'data-attributes="([^"]+)"', webpage, 'player')),
video_id)
- metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+ flashvars = player['flashvars']
+
+ metadata = flashvars.get('metadata')
+ if metadata:
+ metadata = self._parse_json(metadata, video_id)
+ else:
+ metadata = self._download_json(
+ compat_urllib_parse_unquote(flashvars['metadataUrl']),
+ video_id, 'Downloading metadata JSON')
movie = metadata['movie']
title = movie['title']
@@ -52,11 +79,11 @@ class OdnoklassnikiIE(InfoExtractor):
uploader = author.get('name')
upload_date = unified_strdate(self._html_search_meta(
- 'ya:ovs:upload_date', webpage, 'upload date'))
+ 'ya:ovs:upload_date', webpage, 'upload date', default=None))
age_limit = None
adult = self._html_search_meta(
- 'ya:ovs:adult', webpage, 'age limit')
+ 'ya:ovs:adult', webpage, 'age limit', default=None)
if adult:
age_limit = 18 if adult == 'true' else 0
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644
index 000000000..0f1f448fe
--- /dev/null
+++ b/youtube_dl/extractor/onionstudios.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class OnionStudiosIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+ _TESTS = [{
+ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+ 'md5': 'd4851405d31adfadf71cd7a487b765bb',
+ 'info_dict': {
+ 'id': '2937',
+ 'ext': 'mp4',
+ 'title': 'Hannibal charges forward, stops for a cocktail',
+ 'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'The A.V. Club',
+ 'uploader_id': 'TheAVClub',
+ },
+ }, {
+ 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+
+ formats = []
+ for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
+ if determine_ext(src) != 'm3u8': # m3u8 always results in 403
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
+ webpage, 'title', group='title')
+ description = self._search_regex(
+ r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
+ webpage, 'description', default=None, group='description')
+ thumbnail = self._search_regex(
+ r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
+ webpage, 'thumbnail', default=False, group='thumbnail')
+
+ uploader_id = self._search_regex(
+ r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
+ webpage, 'uploader id', fatal=False, group='uploader_id')
+ uploader = self._search_regex(
+ r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
+ webpage, 'uploader', default=False, group='uploader')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index d5b05c18f..a262a9f6d 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -1,63 +1,41 @@
from __future__ import unicode_literals
import re
import json
+import base64
from .common import InfoExtractor
from ..utils import (
unescapeHTML,
ExtractorError,
+ determine_ext,
+ int_or_none,
)
-class OoyalaIE(InfoExtractor):
- _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
-
- _TESTS = [
- {
- # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
- 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'info_dict': {
- 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'ext': 'mp4',
- 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
- 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
- },
- }, {
- # Only available for ipad
- 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'info_dict': {
- 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'ext': 'mp4',
- 'title': 'Simulation Overview - Levels of Simulation',
- 'description': '',
- },
- },
- ]
+class OoyalaBaseIE(InfoExtractor):
- @staticmethod
- def _url_for_embed_code(embed_code):
- return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ def _extract_result(self, info, more_info):
+ embedCode = info['embedCode']
+ video_url = info.get('ipad_url') or info['url']
- @classmethod
- def _build_url_result(cls, embed_code):
- return cls.url_result(cls._url_for_embed_code(embed_code),
- ie=cls.ie_key())
+ if determine_ext(video_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4')
+ else:
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ }]
- def _extract_result(self, info, more_info):
return {
- 'id': info['embedCode'],
- 'ext': 'mp4',
+ 'id': embedCode,
'title': unescapeHTML(info['title']),
- 'url': info.get('ipad_url') or info['url'],
+ 'formats': formats,
'description': unescapeHTML(more_info['description']),
'thumbnail': more_info['promo'],
}
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- embedCode = mobj.group('id')
- player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
- player = self._download_webpage(player_url, embedCode)
+ def _extract(self, player_url, video_id):
+ player = self._download_webpage(player_url, video_id)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
player, 'mobile player url')
# Looks like some videos are only available for particular devices
@@ -70,13 +48,43 @@ class OoyalaIE(InfoExtractor):
devices.insert(0, 'unknown')
for device in devices:
mobile_player = self._download_webpage(
- '%s&device=%s' % (mobile_url, device), embedCode,
+ '%s&device=%s' % (mobile_url, device), video_id,
'Downloading mobile player JS for %s device' % device)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
mobile_player, 'info', fatal=False, default=None)
if videos_info:
break
+
+ if not videos_info:
+ formats = []
+ auth_data = self._download_json(
+ 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id),
+ video_id)
+
+ cur_auth_data = auth_data['authorization_data'][video_id]
+
+ for stream in cur_auth_data['streams']:
+ formats.append({
+ 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'),
+ 'ext': stream.get('delivery_type'),
+ 'format': stream.get('video_codec'),
+ 'format_id': stream.get('profile'),
+ 'width': int_or_none(stream.get('width')),
+ 'height': int_or_none(stream.get('height')),
+ 'abr': int_or_none(stream.get('audio_bitrate')),
+ 'vbr': int_or_none(stream.get('video_bitrate')),
+ })
+ if formats:
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': 'Ooyala video',
+ }
+
+ if not cur_auth_data['authorized']:
+ raise ExtractorError(cur_auth_data['message'], expected=True)
+
if not videos_info:
raise ExtractorError('Unable to extract info')
videos_info = videos_info.replace('\\"', '"')
@@ -89,9 +97,100 @@ class OoyalaIE(InfoExtractor):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
return {
'_type': 'playlist',
- 'id': embedCode,
+ 'id': video_id,
'title': unescapeHTML(videos_more_info['title']),
'entries': videos,
}
else:
return self._extract_result(videos_info[0], videos_more_info)
+
+
+class OoyalaIE(OoyalaBaseIE):
+ _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+ _TESTS = [
+ {
+ # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+ 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'info_dict': {
+ 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'ext': 'mp4',
+ 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+ 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+ },
+ }, {
+ # Only available for ipad
+ 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'info_dict': {
+ 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'ext': 'mp4',
+ 'title': 'Simulation Overview - Levels of Simulation',
+ 'description': '',
+ },
+ },
+ {
+ # Information available only through SAS api
+ # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+ 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'md5': 'a84001441b35ea492bc03736e59e7935',
+ 'info_dict': {
+ 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'ext': 'mp4',
+ 'title': 'Ooyala video',
+ }
+ }
+ ]
+
+ @staticmethod
+ def _url_for_embed_code(embed_code):
+ return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+ @classmethod
+ def _build_url_result(cls, embed_code):
+ return cls.url_result(cls._url_for_embed_code(embed_code),
+ ie=cls.ie_key())
+
+ def _real_extract(self, url):
+ embed_code = self._match_id(url)
+ player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ return self._extract(player_url, embed_code)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ ooyalaexternal:|
+ https?://.+?\.ooyala\.com/.*?\bexternalId=
+ )
+ (?P<partner_id>[^:]+)
+ :
+ (?P<id>.+)
+ (?:
+ :|
+ .*?&pcode=
+ )
+ (?P<pcode>.+?)
+ (&|$)
+ '''
+
+ _TEST = {
+ 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ partner_id = mobj.group('partner_id')
+ video_id = mobj.group('id')
+ pcode = mobj.group('pcode')
+ player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode)
+ return self._extract(player_url, video_id)
diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py
index 2249657eb..d2ceedd01 100644
--- a/youtube_dl/extractor/openfilm.py
+++ b/youtube_dl/extractor/openfilm.py
@@ -3,9 +3,9 @@ from __future__ import unicode_literals
import json
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
from ..utils import (
parse_iso8601,
- compat_urllib_parse,
parse_age_limit,
int_or_none,
)
@@ -37,7 +37,7 @@ class OpenFilmIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- player = compat_urllib_parse.unquote_plus(
+ player = compat_urllib_parse_unquote_plus(
self._og_search_video_url(webpage))
video = json.loads(self._search_regex(
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index f179ea200..6cdc2638b 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
embed = self._html_search_regex(
- r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+ r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
webpage, 'embedded URL', default=None)
if attach_fn is not None:
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 761bd6d8d..683c81de3 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -31,10 +32,13 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2365006249',
'ext': 'mp4',
- 'title': 'A More Perfect Union',
+ 'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
'duration': 3190,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
@@ -42,10 +46,13 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2365297690',
'ext': 'mp4',
- 'title': 'Losing Iraq',
+ 'title': 'FRONTLINE - Losing Iraq',
'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
'duration': 5050,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
},
{
'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -53,7 +60,7 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2201174722',
'ext': 'mp4',
- 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+ 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
'duration': 801,
},
@@ -65,10 +72,13 @@ class PBSIE(InfoExtractor):
'id': '2365297708',
'ext': 'mp4',
'description': 'md5:68d87ef760660eb564455eb30ca464fe',
- 'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+ 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -78,11 +88,15 @@ class PBSIE(InfoExtractor):
'display_id': 'killer-typhoon',
'ext': 'mp4',
'description': 'md5:c741d14e979fc53228c575894094f157',
- 'title': 'Killer Typhoon',
+ 'title': 'NOVA - Killer Typhoon',
'duration': 3172,
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
- }
+ 'age_limit': 10,
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -90,6 +104,36 @@ class PBSIE(InfoExtractor):
'id': 'united-states-of-secrets',
},
'playlist_count': 2,
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+ 'info_dict': {
+ 'id': '2280706814',
+ 'display_id': 'player',
+ 'ext': 'mp4',
+ 'title': 'American Experience - Death and the Civil War',
+ 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+ 'duration': 6705,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
+ 'url': 'http://video.pbs.org/video/2365367186/',
+ 'info_dict': {
+ 'id': '2365367186',
+ 'display_id': '2365367186',
+ 'ext': 'mp4',
+ 'title': 'To Catch A Comet - Full Episode',
+ 'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.',
+ 'duration': 3342,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}
]
@@ -123,7 +167,7 @@ class PBSIE(InfoExtractor):
return media_id, presumptive_id, upload_date
url = self._search_regex(
- r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+ r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
webpage, 'player URL')
mobj = re.match(self._VALID_URL, url)
@@ -187,6 +231,7 @@ class PBSIE(InfoExtractor):
else:
formats.append({
'url': format_url,
+ 'format_id': redirect.get('eeid'),
})
self._sort_formats(formats)
@@ -195,6 +240,20 @@ class PBSIE(InfoExtractor):
rating_str = rating_str.rpartition('-')[2]
age_limit = US_RATINGS.get(rating_str)
+ subtitles = {}
+ closed_captions_url = info.get('closed_captions_url')
+ if closed_captions_url:
+ subtitles['en'] = [{
+ 'ext': 'ttml',
+ 'url': closed_captions_url,
+ }]
+
+ # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
+ # Try turning it to 'program - title' naming scheme if possible
+ alt_title = info.get('program', {}).get('title')
+ if alt_title:
+ info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+
return {
'id': video_id,
'display_id': display_id,
@@ -205,4 +264,5 @@ class PBSIE(InfoExtractor):
'age_limit': age_limit,
'upload_date': upload_date,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
new file mode 100644
index 000000000..8ad936758
--- /dev/null
+++ b/youtube_dl/extractor/periscope.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import parse_iso8601
+
+
+class PeriscopeIE(InfoExtractor):
+ IE_DESC = 'Periscope'
+ _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+ 'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+ 'info_dict': {
+ 'id': '56102209',
+ 'ext': 'mp4',
+ 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+ 'timestamp': 1438978559,
+ 'upload_date': '20150807',
+ 'uploader': 'Bec Boop',
+ 'uploader_id': '1465763',
+ },
+ 'skip': 'Expires in 24 hours',
+ }
+
+ def _call_api(self, method, token):
+ return self._download_json(
+ 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token)
+
+ def _real_extract(self, url):
+ token = self._match_id(url)
+
+ broadcast_data = self._call_api('getBroadcastPublic', token)
+ broadcast = broadcast_data['broadcast']
+ status = broadcast['status']
+
+ uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
+ uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+
+ title = '%s - %s' % (uploader, status) if uploader else status
+ state = broadcast.get('state').lower()
+ if state == 'running':
+ title = self._live_title(title)
+ timestamp = parse_iso8601(broadcast.get('created_at'))
+
+ thumbnails = [{
+ 'url': broadcast[image],
+ } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+ stream = self._call_api('getAccessPublic', token)
+
+ formats = []
+ for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+ video_url = stream.get(format_id + '_url')
+ if not video_url:
+ continue
+ f = {
+ 'url': video_url,
+ 'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+ }
+ if format_id != 'rtmp':
+ f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8'
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': broadcast.get('id') or token,
+ 'title': title,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
+
+
+class QuickscopeIE(InfoExtractor):
+ IE_DESC = 'Quick Scope'
+ _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://watchonperiscope.com/broadcast/56180087',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ broadcast_id = self._match_id(url)
+ request = compat_urllib_request.Request(
+ 'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({
+ 'broadcast_id': broadcast_id,
+ 'entry_ticket': '',
+ 'from_push': 'false',
+ 'uses_sessions': 'true',
+ }).encode('utf-8'))
+ return self.url_result(
+ self._download_json(request, broadcast_id)['share_url'], 'Periscope')
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index c66db3cdc..788411ccc 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -4,7 +4,7 @@ import json
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
class PhotobucketIE(InfoExtractor):
@@ -34,7 +34,7 @@ class PhotobucketIE(InfoExtractor):
info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
webpage, 'info json')
info = json.loads(info_json)
- url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+ url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
return {
'id': video_id,
'url': url,
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644
index 000000000..a52210fab
--- /dev/null
+++ b/youtube_dl/extractor/pinkbike.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ remove_end,
+ remove_start,
+ str_to_int,
+ unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.pinkbike.com/video/402811/',
+ 'md5': '4814b8ca7651034cd87e3361d5c2155a',
+ 'info_dict': {
+ 'id': '402811',
+ 'ext': 'mp4',
+ 'title': 'Brandon Semenuk - RAW 100',
+ 'description': 'Official release: www.redbull.ca/rupertwalker',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 100,
+ 'upload_date': '20150406',
+ 'uploader': 'revelco',
+ 'location': 'Victoria, British Columbia, Canada',
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+ formats = []
+ for _, format_id, src in re.findall(
+ r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append({
+ 'url': src,
+ 'format_id': format_id,
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+ description = self._html_search_regex(
+ r'(?s)id="media-description"[^>]*>(.+?)<',
+ webpage, 'description', default=None) or remove_start(
+ self._og_search_description(webpage), title + '. ')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ uploader = self._search_regex(
+ r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._search_regex(
+ r'class="fullTime"[^>]+title="([^"]+)"',
+ webpage, 'upload date', fatal=False))
+
+ location = self._html_search_regex(
+ r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+ webpage, 'location', fatal=False)
+
+ def extract_count(webpage, label):
+ return str_to_int(self._search_regex(
+ r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+ webpage, label, fatal=False))
+
+ view_count = extract_count(webpage, 'Views')
+ comment_count = extract_count(webpage, 'Comments')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'location': location,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py
index 596c621d7..06505e96f 100644
--- a/youtube_dl/extractor/planetaplay.py
+++ b/youtube_dl/extractor/planetaplay.py
@@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor):
'id': '3586',
'ext': 'flv',
'title': 'md5:e829428ee28b1deed00de90de49d1da1',
- }
+ },
+ 'skip': 'Not accessible from Travis CI server',
}
_SONG_FORMATS = {
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 45716c75d..8a1c296dd 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -38,9 +38,7 @@ class PlayedIE(InfoExtractor):
if m_error:
raise ExtractorError(m_error.group('msg'), expected=True)
- fields = re.findall(
- r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
- data = dict(fields)
+ data = self._hidden_inputs(orig_webpage)
self._sleep(2, video_id)
diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py
new file mode 100644
index 000000000..e360404f7
--- /dev/null
+++ b/youtube_dl/extractor/playtvak.py
@@ -0,0 +1,181 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ qualities,
+)
+
+
+class PlaytvakIE(InfoExtractor):
+ IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz'
+ _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)'
+ _TESTS = [{
+ 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko',
+ 'md5': '4525ae312c324b4be2f4603cc78ceb4a',
+ 'info_dict': {
+ 'id': 'A150730_150323_hodinovy-manzel_kuko',
+ 'ext': 'mp4',
+ 'title': 'Vyžeňte vosy a sršně ze zahrady',
+ 'description': 'md5:f93d398691044d303bc4a3de62f3e976',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'duration': 279,
+ 'timestamp': 1438732860,
+ 'upload_date': '20150805',
+ 'is_live': False,
+ }
+ }, { # live video test
+ 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat',
+ 'info_dict': {
+ 'id': 'A150624_164934_planespotting_cat',
+ 'ext': 'flv',
+ 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }, { # idnes.cz
+ 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku',
+ 'md5': '819832ba33cd7016e58a6658577fe289',
+ 'info_dict': {
+ 'id': 'A150809_104116_domaci_pku',
+ 'ext': 'mp4',
+ 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se',
+ 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'duration': 39,
+ 'timestamp': 1438969140,
+ 'upload_date': '20150807',
+ 'is_live': False,
+ }
+ }, { # lidovky.cz
+ 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE',
+ 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8',
+ 'info_dict': {
+ 'id': 'A150808_214044_ln-video_ELE',
+ 'ext': 'mp4',
+ 'title': 'Táhni! Demonstrace proti imigrantům budila emoce',
+ 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1439052180,
+ 'upload_date': '20150808',
+ 'is_live': False,
+ }
+ }, { # metro.cz
+ 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row',
+ 'md5': '84fc1deedcac37b7d4a6ccae7c716668',
+ 'info_dict': {
+ 'id': 'A141111_173251_metro-extra_row',
+ 'ext': 'mp4',
+ 'title': 'Recesisté udělali z billboardu kolotoč',
+ 'description': 'md5:7369926049588c3989a66c9c1a043c4c',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1415725500,
+ 'upload_date': '20141111',
+ 'is_live': False,
+ }
+ }, {
+ 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ info_url = self._html_search_regex(
+ r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url')
+
+ parsed_url = compat_urlparse.urlparse(info_url)
+
+ qs = compat_urlparse.parse_qs(parsed_url.query)
+ qs.update({
+ 'reklama': ['0'],
+ 'type': ['js'],
+ })
+
+ info_url = compat_urlparse.urlunparse(
+ parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+ json_info = self._download_json(
+ info_url, video_id,
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+ item = None
+ for i in json_info['items']:
+ if i.get('type') == 'video' or i.get('type') == 'stream':
+ item = i
+ break
+ if not item:
+ raise ExtractorError('No suitable stream found')
+
+ quality = qualities(('low', 'middle', 'high'))
+
+ formats = []
+ for fmt in item['video']:
+ video_url = fmt.get('file')
+ if not video_url:
+ continue
+
+ format_ = fmt['format']
+ format_id = '%s_%s' % (format_, fmt['quality'])
+ preference = None
+
+ if format_ in ('mp4', 'webm'):
+ ext = format_
+ elif format_ == 'rtmp':
+ ext = 'flv'
+ elif format_ == 'apple':
+ ext = 'mp4'
+ # Some streams have mp3 audio which does not play
+ # well with ffmpeg filter aac_adtstoasc
+ preference = -1
+ elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests
+ continue
+ else: # Other formats not supported yet
+ continue
+
+ formats.append({
+ 'url': video_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ 'quality': quality(fmt.get('quality')),
+ 'preference': preference,
+ })
+ self._sort_formats(formats)
+
+ title = item['title']
+ is_live = item['type'] == 'stream'
+ if is_live:
+ title = self._live_title(title)
+ description = self._og_search_description(webpage, default=None) or self._html_search_meta(
+ 'description', webpage, 'description')
+ timestamp = None
+ duration = None
+ if not is_live:
+ duration = int_or_none(item.get('length'))
+ timestamp = item.get('published')
+ if timestamp:
+ timestamp = parse_iso8601(timestamp[:-5])
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': item.get('image'),
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'is_live': is_live,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
index c3e667e9e..2eb4fd96d 100644
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -4,7 +4,8 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
)
from ..utils import (
clean_html,
@@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor):
flashvars = self._html_search_regex(
r'flashvars="(.+?)"', webpage, 'flashvars')
- infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+ infos = compat_urllib_parse_unquote(flashvars).split(r'&')
for info in infos:
videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
if videovars_match:
@@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor):
val = videovars_match.group(2)
if key == 'title':
- video_title = compat_urllib_parse.unquote_plus(val)
+ video_title = compat_urllib_parse_unquote_plus(val)
if key == 'duration':
try:
duration = int(val)
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
new file mode 100644
index 000000000..fd32836cc
--- /dev/null
+++ b/youtube_dl/extractor/pluralsight.py
@@ -0,0 +1,207 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_duration,
+)
+
+
+class PluralsightIE(InfoExtractor):
+ IE_NAME = 'pluralsight'
+ _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/training/player\?author=(?P<author>[^&]+)&name=(?P<name>[^&]+)(?:&mode=live)?&clip=(?P<clip>\d+)&course=(?P<course>[^&]+)'
+ _LOGIN_URL = 'https://www.pluralsight.com/id/'
+ _NETRC_MACHINE = 'pluralsight'
+
+ _TEST = {
+ 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
+ 'md5': '4d458cf5cf4c593788672419a8dd4cf8',
+ 'info_dict': {
+ 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
+ 'ext': 'mp4',
+ 'title': 'Management of SQL Server - Demo Monitoring',
+ 'duration': 338,
+ },
+ 'skip': 'Requires pluralsight account credentials',
+ }
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ self.raise_login_required('Pluralsight account is required')
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'Username': username.encode('utf-8'),
+ 'Password': password.encode('utf-8'),
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=self._LOGIN_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+ request = compat_urllib_request.Request(
+ post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ error = self._search_regex(
+ r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ author = mobj.group('author')
+ name = mobj.group('name')
+ clip_id = mobj.group('clip')
+ course = mobj.group('course')
+
+ display_id = '%s-%s' % (name, clip_id)
+
+ webpage = self._download_webpage(url, display_id)
+
+ collection = self._parse_json(
+ self._search_regex(
+ r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)',
+ webpage, 'modules'),
+ display_id)
+
+ module, clip = None, None
+
+ for module_ in collection:
+ if module_.get('moduleName') == name:
+ module = module_
+ for clip_ in module_.get('clips', []):
+ clip_index = clip_.get('clipIndex')
+ if clip_index is None:
+ continue
+ if compat_str(clip_index) == clip_id:
+ clip = clip_
+ break
+
+ if not clip:
+ raise ExtractorError('Unable to resolve clip')
+
+ QUALITIES = {
+ 'low': {'width': 640, 'height': 480},
+ 'medium': {'width': 848, 'height': 640},
+ 'high': {'width': 1024, 'height': 768},
+ }
+
+ ALLOWED_QUALITIES = (
+ ('webm', ('high',)),
+ ('mp4', ('low', 'medium', 'high',)),
+ )
+
+ formats = []
+ for ext, qualities in ALLOWED_QUALITIES:
+ for quality in qualities:
+ f = QUALITIES[quality].copy()
+ clip_post = {
+ 'a': author,
+ 'cap': 'false',
+ 'cn': clip_id,
+ 'course': course,
+ 'lc': 'en',
+ 'm': name,
+ 'mt': ext,
+ 'q': '%dx%d' % (f['width'], f['height']),
+ }
+ request = compat_urllib_request.Request(
+ 'http://www.pluralsight.com/training/Player/ViewClip',
+ json.dumps(clip_post).encode('utf-8'))
+ request.add_header('Content-Type', 'application/json;charset=utf-8')
+ format_id = '%s-%s' % (ext, quality)
+ clip_url = self._download_webpage(
+ request, display_id, 'Downloading %s URL' % format_id, fatal=False)
+ if not clip_url:
+ continue
+ f.update({
+ 'url': clip_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ # TODO: captions
+ # http://www.pluralsight.com/training/Player/ViewClip + cap = true
+ # or
+ # http://www.pluralsight.com/training/Player/Captions
+ # { a = author, cn = clip_id, lc = end, m = name }
+
+ return {
+ 'id': clip['clipName'],
+ 'title': '%s - %s' % (module['title'], clip['title']),
+ 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')),
+ 'creator': author,
+ 'formats': formats
+ }
+
+
+class PluralsightCourseIE(InfoExtractor):
+ IE_NAME = 'pluralsight:course'
+ _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/courses/(?P<id>[^/]+)'
+ _TEST = {
+ # Free course from Pluralsight Starter Subscription for Microsoft TechNet
+ # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
+ 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
+ 'info_dict': {
+ 'id': 'hosting-sql-server-windows-azure-iaas',
+ 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
+ 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
+ },
+ 'playlist_count': 31,
+ }
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ # TODO: PSM cookie
+
+ course = self._download_json(
+ 'http://www.pluralsight.com/data/course/%s' % course_id,
+ course_id, 'Downloading course JSON')
+
+ title = course['title']
+ description = course.get('description') or course.get('shortDescription')
+
+ course_data = self._download_json(
+ 'http://www.pluralsight.com/data/course/content/%s' % course_id,
+ course_id, 'Downloading course data JSON')
+
+ entries = []
+ for module in course_data:
+ for clip in module.get('clips', []):
+ player_parameters = clip.get('playerParameters')
+ if not player_parameters:
+ continue
+ entries.append(self.url_result(
+ 'http://www.pluralsight.com/training/player?%s' % player_parameters,
+ 'Pluralsight'))
+
+ return self.playlist_result(entries, course_id, title, description)
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644
index 000000000..3e15533e9
--- /dev/null
+++ b/youtube_dl/extractor/porn91.py
@@ -0,0 +1,73 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+ ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+ IE_NAME = '91porn'
+ _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+ _TEST = {
+ 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+ 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'info_dict': {
+ 'id': '7e42283b4f5ab36da134',
+ 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+ 'ext': 'mp4',
+ 'duration': 431,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+ self._set_cookie('91porn.com', 'language', 'cn_CN')
+ webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+ if '作为游客,你每天只可观看10个视频' in webpage:
+ raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+ title = self._search_regex(
+ r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+ title = title.replace('\n', '')
+
+ # get real url
+ file_id = self._search_regex(
+ r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+ sec_code = self._search_regex(
+ r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+ max_vid = self._search_regex(
+ r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+ url_params = compat_urllib_parse.urlencode({
+ 'VID': file_id,
+ 'mp4': '1',
+ 'seccode': sec_code,
+ 'max_vid': max_vid,
+ })
+ info_cn = self._download_webpage(
+ 'http://91porn.com/getfile.php?' + url_params, video_id,
+ 'get real video url')
+ video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+ duration = parse_duration(self._search_regex(
+ r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+ comment_count = int_or_none(self._search_regex(
+ r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'comment_count': comment_count,
+ 'age_limit': self._rta_search(webpage),
+ }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 0c8b731cf..7b0cdc41a 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -5,7 +5,8 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlparse,
compat_urllib_request,
)
@@ -19,8 +20,8 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+ _TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '882f488fa1f0026f023f33576004a2ed',
'info_dict': {
@@ -30,7 +31,17 @@ class PornHubIE(InfoExtractor):
"title": "Seductive Indian beauty strips down and fingers her pink pussy",
"age_limit": 18
}
- }
+ }, {
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
def _extract_count(self, pattern, webpage, name):
return str_to_int(self._search_regex(
@@ -39,7 +50,8 @@ class PornHubIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = compat_urllib_request.Request(
+ 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
@@ -58,7 +70,7 @@ class PornHubIE(InfoExtractor):
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
- thumbnail = compat_urllib_parse.unquote(thumbnail)
+ thumbnail = compat_urllib_parse_unquote(thumbnail)
view_count = self._extract_count(
r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
@@ -69,9 +81,10 @@ class PornHubIE(InfoExtractor):
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
- video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+ video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
if webpage.find('"encrypted":true') != -1:
- password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+ password = compat_urllib_parse_unquote_plus(
+ self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
formats = []
@@ -81,7 +94,7 @@ class PornHubIE(InfoExtractor):
format = path.split('/')[5].split('_')[:2]
format = "-".join(format)
- m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
+ m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
if m is None:
height = None
tbr = None
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
index 9688ed948..eba4dfbb3 100644
--- a/youtube_dl/extractor/pornovoisines.py
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
'duration': 120,
'view_count': int,
'average_rating': float,
- 'categories': ['Débutante', 'Scénario', 'Sodomie'],
+ 'categories': ['Débutantes', 'Scénario', 'Sodomie'],
'age_limit': 18,
}
}
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
view_count = int_or_none(self._search_regex(
r'(\d+) vues', webpage, 'view count', fatal=False))
average_rating = self._search_regex(
- r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+ r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
if average_rating:
average_rating = float_or_none(average_rating.replace(',', '.'))
diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py
index 01cc3d9ea..304359dc5 100644
--- a/youtube_dl/extractor/primesharetv.py
+++ b/youtube_dl/extractor/primesharetv.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
@@ -31,12 +29,7 @@ class PrimeShareTVIE(InfoExtractor):
if '>File not exist<' in webpage:
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- (?:id="[^"]+"\s+)?
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
headers = {
'Referer': url,
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
index f536e6e6c..8190ed676 100644
--- a/youtube_dl/extractor/promptfile.py
+++ b/youtube_dl/extractor/promptfile.py
@@ -35,10 +35,7 @@ class PromptFileIE(InfoExtractor):
raise ExtractorError('Video %s does not exist' % video_id,
expected=True)
- fields = dict(re.findall(r'''(?x)type="hidden"\s+
- name="(.+?)"\s+
- value="(.*?)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
post = compat_urllib_parse.urlencode(fields)
req = compat_urllib_request.Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 7cc799664..effcf1db3 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -9,18 +9,26 @@ from ..compat import (
compat_urllib_parse,
)
from ..utils import (
- unified_strdate,
+ ExtractorError,
+ determine_ext,
+ float_or_none,
int_or_none,
+ unified_strdate,
)
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
+ # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242
+ # in response to fixing https://github.com/rg3/youtube-dl/issues/6215:
+ # - malformed f4m manifest support
+ # - proper handling of URLs starting with `https?://` in 2.0 manifests
+ # - recursive child f4m manifests extraction
'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
'info_dict': {
'id': '2104602',
@@ -177,6 +185,7 @@ class ProSiebenSat1IE(InfoExtractor):
r'<header class="clearfix">\s*<h3>(.+?)</h3>',
r'<!-- start video -->\s*<h1>(.+?)</h1>',
r'<h1 class="att-name">\s*(.+?)</h1>',
+ r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +215,8 @@ class ProSiebenSat1IE(InfoExtractor):
def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
- access_token = 'testclient'
- client_name = 'kolibri-1.2.5'
+ access_token = 'prosieben'
+ client_name = 'kolibri-2.0.19-splec4'
client_location = url
videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -217,10 +226,13 @@ class ProSiebenSat1IE(InfoExtractor):
'ids': clip_id,
})
- videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+ video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
+
+ if video.get('is_protected') is True:
+ raise ExtractorError('This video is DRM protected.', expected=True)
- duration = float(videos[0]['duration'])
- source_ids = [source['id'] for source in videos[0]['sources']]
+ duration = float_or_none(video.get('duration'))
+ source_ids = [source['id'] for source in video['sources']]
source_ids_str = ','.join(map(str, source_ids))
g = '01!8d8F_)r9]4s[qeuXfP%'
@@ -274,23 +286,30 @@ class ProSiebenSat1IE(InfoExtractor):
for source in urls_sources:
protocol = source['protocol']
+ source_url = source['url']
if protocol == 'rtmp' or protocol == 'rtmpe':
- mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
if not mobj:
continue
+ path = mobj.group('path')
+ mp4colon_index = path.rfind('mp4:')
+ app = path[:mp4colon_index]
+ play_path = path[mp4colon_index:]
formats.append({
- 'url': mobj.group('url'),
- 'app': mobj.group('app'),
- 'play_path': mobj.group('playpath'),
+ 'url': '%s/%s' % (mobj.group('url'), app),
+ 'app': app,
+ 'play_path': play_path,
'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
'page_url': 'http://www.prosieben.de',
'vbr': fix_bitrate(source['bitrate']),
'ext': 'mp4',
'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
})
+ elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(source_url, clip_id))
else:
formats.append({
- 'url': source['url'],
+ 'url': source_url,
'vbr': fix_bitrate(source['bitrate']),
})
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 174c8e0ae..1654a641f 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -9,25 +9,48 @@ from .common import InfoExtractor
from ..utils import (
strip_jsonp,
unescapeHTML,
+ clean_html,
)
from ..compat import compat_urllib_request
class QQMusicIE(InfoExtractor):
+ IE_NAME = 'qqmusic'
+ IE_DESC = 'QQ音乐'
_VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
- 'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+ 'md5': '9ce1c1c8445f561506d2e3cfb0255705',
'info_dict': {
'id': '004295Et37taLD',
- 'ext': 'm4a',
+ 'ext': 'mp3',
'title': '可惜没如果',
'upload_date': '20141227',
'creator': '林俊杰',
- 'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
+ 'description': 'md5:d327722d0361576fde558f1ac68a7065',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'note': 'There is no mp3-320 version of this song.',
+ 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+ 'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
+ 'info_dict': {
+ 'id': '004MsGEo3DdNxV',
+ 'ext': 'mp3',
+ 'title': '如果',
+ 'upload_date': '20050626',
+ 'creator': '李季美',
+ 'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
+ 'thumbnail': 're:^https?://.*\.jpg$',
}
}]
+ _FORMATS = {
+ 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+ 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+ 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+ }
+
# Reference: m_r_GetRUin() in top_player.js
# http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
@staticmethod
@@ -58,6 +81,16 @@ class QQMusicIE(InfoExtractor):
lrc_content = self._html_search_regex(
r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
detail_info_page, 'LRC lyrics', default=None)
+ if lrc_content:
+ lrc_content = lrc_content.replace('\\n', '\n')
+
+ thumbnail_url = None
+ albummid = self._search_regex(
+ [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+ detail_info_page, 'album mid', default=None)
+ if albummid:
+ thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+ % (albummid[-2:-1], albummid[-1], albummid)
guid = self.m_r_get_ruin()
@@ -65,15 +98,28 @@ class QQMusicIE(InfoExtractor):
'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
mid, note='Retrieve vkey', errnote='Unable to get vkey',
transform_source=strip_jsonp)['key']
- song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+ formats = []
+ for format_id, details in self._FORMATS.items():
+ formats.append({
+ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+ % (details['prefix'], mid, details['ext'], vkey, guid),
+ 'format': format_id,
+ 'format_id': format_id,
+ 'preference': details['preference'],
+ 'abr': details.get('abr'),
+ })
+ self._check_formats(formats, mid)
+ self._sort_formats(formats)
return {
'id': mid,
- 'url': song_url,
+ 'formats': formats,
'title': song_name,
'upload_date': publish_time,
'creator': singer,
'description': lrc_content,
+ 'thumbnail': thumbnail_url,
}
@@ -96,6 +142,8 @@ class QQPlaylistBaseIE(InfoExtractor):
class QQMusicSingerIE(QQPlaylistBaseIE):
+ IE_NAME = 'qqmusic:singer'
+ IE_DESC = 'QQ音乐 - 歌手'
_VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
_TEST = {
'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
@@ -139,32 +187,131 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
class QQMusicAlbumIE(QQPlaylistBaseIE):
+ IE_NAME = 'qqmusic:album'
+ IE_DESC = 'QQ音乐 - 专辑'
_VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
- _TEST = {
- 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1&play=0',
+ _TESTS = [{
+ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
'info_dict': {
'id': '000gXCTb2AhRR1',
'title': '我们都是这样长大的',
- 'description': 'md5:d216c55a2d4b3537fe4415b8767d74d6',
+ 'description': 'md5:179c5dce203a5931970d306aa9607ea6',
},
'playlist_count': 4,
- }
+ }, {
+ 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+ 'info_dict': {
+ 'id': '002Y5a3b3AlCu3',
+ 'title': '그리고...',
+ 'description': 'md5:a48823755615508a95080e81b51ba729',
+ },
+ 'playlist_count': 8,
+ }]
def _real_extract(self, url):
mid = self._match_id(url)
- album_page = self._download_webpage(
- self.qq_static_url('album', mid), mid, 'Download album page')
+ album = self._download_json(
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+ mid, 'Download album page')['data']
- entries = self.get_entries_from_page(album_page)
+ entries = [
+ self.url_result(
+ 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+ ) for song in album['list']
+ ]
+ album_name = album.get('name')
+ album_detail = album.get('desc')
+ if album_detail is not None:
+ album_detail = album_detail.strip()
- album_name = self._html_search_regex(
- r"albumname\s*:\s*'([^']+)',", album_page, 'album name',
- default=None)
+ return self.playlist_result(entries, mid, album_name, album_detail)
- album_detail = self._html_search_regex(
- r'<div class="album_detail close_detail">\s*<p>((?:[^<>]+(?:<br />)?)+)</p>',
- album_page, 'album details', default=None)
- return self.playlist_result(entries, mid, album_name, album_detail)
+class QQMusicToplistIE(QQPlaylistBaseIE):
+ IE_NAME = 'qqmusic:toplist'
+ IE_DESC = 'QQ音乐 - 排行榜'
+ _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+
+ _TESTS = [{
+ 'url': 'http://y.qq.com/#type=toplist&p=global_123',
+ 'info_dict': {
+ 'id': 'global_123',
+ 'title': '美国iTunes榜',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'http://y.qq.com/#type=toplist&p=top_3',
+ 'info_dict': {
+ 'id': 'top_3',
+ 'title': 'QQ音乐巅峰榜·欧美',
+ 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+ '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+ '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+ '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+ },
+ 'playlist_count': 100,
+ }, {
+ 'url': 'http://y.qq.com/#type=toplist&p=global_106',
+ 'info_dict': {
+ 'id': 'global_106',
+ 'title': '韩国Mnet榜',
+ },
+ 'playlist_count': 50,
+ }]
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+
+ list_type, num_id = list_id.split("_")
+
+ toplist_json = self._download_json(
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+ % (list_type, num_id),
+ list_id, 'Download toplist page')
+
+ entries = [
+ self.url_result(
+ 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
+ ) for song in toplist_json['songlist']
+ ]
+
+ topinfo = toplist_json.get('topinfo', {})
+ list_name = topinfo.get('ListName')
+ list_description = topinfo.get('info')
+ return self.playlist_result(entries, list_id, list_name, list_description)
+
+
+class QQMusicPlaylistIE(QQPlaylistBaseIE):
+ IE_NAME = 'qqmusic:playlist'
+ IE_DESC = 'QQ音乐 - 歌单'
+ _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+
+ _TEST = {
+ 'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+ 'info_dict': {
+ 'id': '3462654915',
+ 'title': '韩国5月新歌精选下旬',
+ 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
+ },
+ 'playlist_count': 40,
+ }
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+
+ list_json = self._download_json(
+ 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
+ % list_id, list_id, 'Download list page',
+ transform_source=strip_jsonp)['cdlist'][0]
+
+ entries = [
+ self.url_result(
+ 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+ ) for song in list_json['songlist']
+ ]
+
+ list_name = list_json.get('dissname')
+ list_description = clean_html(unescapeHTML(list_json.get('desc')))
+ return self.playlist_result(entries, list_id, list_name, list_description)
diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py
index af7d76cf4..f414e2384 100644
--- a/youtube_dl/extractor/quickvid.py
+++ b/youtube_dl/extractor/quickvid.py
@@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
'view_count': int,
},
+ 'skip': 'Not accessible from Travis CI server',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
new file mode 100644
index 000000000..796adfdf9
--- /dev/null
+++ b/youtube_dl/extractor/rds.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+)
+
+
+class RDSIE(InfoExtractor):
+ IE_DESC = 'RDS.ca'
+ _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+ 'info_dict': {
+ 'id': '3.1132799',
+ 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+ 'ext': 'mp4',
+ 'title': 'Fowler Jr. prend la direction de Jacksonville',
+ 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
+ 'timestamp': 1430397346,
+ 'upload_date': '20150430',
+ 'duration': 154.354,
+ 'age_limit': 0,
+ }
+ }, {
+ 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ # TODO: extract f4m from 9c9media.com
+ video_url = self._search_regex(
+ r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
+ webpage, 'video url')
+
+ title = self._og_search_title(webpage) or self._html_search_meta(
+ 'title', webpage, 'title', fatal=True)
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+ [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
+ r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
+ webpage, 'thumbnail', fatal=False)
+ timestamp = parse_iso8601(self._search_regex(
+ r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
+ webpage, 'upload date', fatal=False))
+ duration = parse_duration(self._search_regex(
+ r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
+ webpage, 'duration', fatal=False))
+ age_limit = self._family_friendly_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index dce64e151..e4215d546 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -1,10 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unescapeHTML,
+)
class RTBFIE(InfoExtractor):
@@ -16,34 +17,47 @@ class RTBFIE(InfoExtractor):
'id': '1921274',
'ext': 'mp4',
'title': 'Les Diables au coeur (épisode 2)',
- 'description': 'Football - Diables Rouges',
'duration': 3099,
- 'timestamp': 1398456336,
- 'upload_date': '20140425',
}
}
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ _QUALITIES = [
+ ('mobile', 'mobile'),
+ ('web', 'SD'),
+ ('url', 'MD'),
+ ('high', 'HD'),
+ ]
- page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
- data = json.loads(self._html_search_regex(
- r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+ webpage = self._download_webpage(
+ 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
- video_url = data.get('downloadUrl') or data.get('url')
+ data = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-video="([^"]+)"', webpage, 'data video')),
+ video_id)
- if data['provider'].lower() == 'youtube':
+ if data.get('provider').lower() == 'youtube':
+ video_url = data.get('downloadUrl') or data.get('url')
return self.url_result(video_url, 'Youtube')
+ formats = []
+ for key, format_id in self._QUALITIES:
+ format_url = data['sources'].get(key)
+ if format_url:
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ })
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': data['title'],
'description': data.get('description') or data.get('subtitle'),
- 'thumbnail': data['thumbnail']['large'],
+ 'thumbnail': data.get('thumbnail'),
'duration': data.get('duration') or data.get('realDuration'),
- 'timestamp': data['created'],
- 'view_count': data['viewCount'],
+ 'timestamp': int_or_none(data.get('created')),
+ 'view_count': int_or_none(data.get('viewCount')),
}
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
index 72cd80498..25f7faf76 100644
--- a/youtube_dl/extractor/rtl2.py
+++ b/youtube_dl/extractor/rtl2.py
@@ -1,6 +1,7 @@
# encoding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
@@ -8,22 +9,28 @@ class RTL2IE(InfoExtractor):
_VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))'
_TESTS = [{
'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
- 'md5': 'bfcc179030535b08dc2b36b469b5adc7',
'info_dict': {
'id': 'folge-203-0',
'ext': 'f4v',
'title': 'GRIP sucht den Sommerkönig',
'description': 'Matthias, Det und Helge treten gegeneinander an.'
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/',
- 'md5': 'ffcd517d2805b57ce11a58a2980c2b02',
'info_dict': {
'id': '21040-anna-erwischt-alex',
'ext': 'mp4',
'title': 'Anna erwischt Alex!',
'description': 'Anna ist Alex\' Tochter bei Köln 50667.'
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -34,12 +41,18 @@ class RTL2IE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- vico_id = self._html_search_regex(
- r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
- vivi_id = self._html_search_regex(
- r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
+ mobj = re.search(
+ r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"',
+ webpage)
+ if mobj:
+ vico_id = mobj.group('vico_id')
+ vivi_id = mobj.group('vivi_id')
+ else:
+ vico_id = self._html_search_regex(
+ r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
+ vivi_id = self._html_search_regex(
+ r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id
- webpage = self._download_webpage(info_url, '')
info = self._download_json(info_url, video_id)
video_info = info['video']
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index cfce4550a..543d94417 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor):
IE_NAME = 'rtl.nl'
IE_DESC = 'rtl.nl and rtlxl.nl'
_VALID_URL = r'''(?x)
- https?://(www\.)?
+ https?://(?:www\.)?
(?:
rtlxl\.nl/\#!/[^/]+/|
- rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+ rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
)
(?P<id>[0-9a-f-]+)'''
@@ -43,26 +43,60 @@ class RtlNlIE(InfoExtractor):
'upload_date': '20150215',
'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
}
+ }, {
+ # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
+ 'info_dict': {
+ 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
+ 'ext': 'mp4',
+ 'title': 'RTL Nieuws - Meer beelden van overval juwelier',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+ 'timestamp': 1437233400,
+ 'upload_date': '20150718',
+ 'duration': 30.474,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # encrypted m3u8 streams, georestricted
+ 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+ 'only_matching': True,
}]
def _real_extract(self, url):
uuid = self._match_id(url)
info = self._download_json(
- 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+ 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
uuid)
material = info['material'][0]
- progname = info['abstracts'][0]['name']
- subtitle = material['title'] or info['episodes'][0]['name']
- description = material.get('synopsis') or info['episodes'][0]['synopsis']
+ title = info['abstracts'][0]['name']
+ subtitle = material.get('title')
+ if subtitle:
+ title += ' - %s' % subtitle
+ description = material.get('synopsis')
+
+ meta = info.get('meta', {})
- # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
- videopath = material['videopath'].replace('.f4m', '.m3u8')
- m3u8_url = 'http://manifest.us.rtl.nl' + videopath
+ # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv.
+ # To workaround this previously adaptive -> flash trick was used to obtain
+ # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118)
+ # and bypass georestrictions as well.
+ # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore
+ # unusable albeit can be fixed by simple string replacement (see
+ # https://github.com/rg3/youtube-dl/pull/6337)
+ # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted
+ # streams are used now.
+ videopath = material['videopath']
+ m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
- video_urlpart = videopath.split('/flash/')[1][:-5]
+ video_urlpart = videopath.split('/adaptive/')[1][:-5]
PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
formats.extend([
@@ -79,7 +113,7 @@ class RtlNlIE(InfoExtractor):
self._sort_formats(formats)
thumbnails = []
- meta = info.get('meta', {})
+
for p in ('poster_base_url', '"thumb_base_url"'):
if not meta.get(p):
continue
@@ -95,7 +129,7 @@ class RtlNlIE(InfoExtractor):
return {
'id': uuid,
- 'title': '%s - %s' % (progname, subtitle),
+ 'title': title,
'formats': formats,
'timestamp': material['original_date'],
'description': description,
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644
index 785a8045e..000000000
--- a/youtube_dl/extractor/rtlnow.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- clean_html,
- unified_strdate,
- int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
- """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'''(?x)
- (?:https?://)?
- (?P<url>
- (?P<domain>
- rtl-now\.rtl\.de|
- rtl2now\.rtl2\.de|
- (?:www\.)?voxnow\.de|
- (?:www\.)?rtlnitronow\.de|
- (?:www\.)?superrtlnow\.de|
- (?:www\.)?n-tvnow\.de)
- /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
- (?:container_id|film_id)=(?P<video_id>[0-9]+)&
- player=1(?:&season=[0-9]+)?(?:&.*)?
- )'''
-
- _TESTS = [
- {
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'info_dict': {
- 'id': '90419',
- 'ext': 'flv',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
- 'upload_date': '20070416',
- 'duration': 1685,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'info_dict': {
- 'id': '69756',
- 'ext': 'flv',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- 'upload_date': '20120519',
- 'duration': 1245,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'info_dict': {
- 'id': '13883',
- 'ext': 'flv',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
- 'upload_date': '20090627',
- 'duration': 1800,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'info_dict': {
- 'id': '99205',
- 'ext': 'flv',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
- 'upload_date': '20080928',
- 'duration': 2691,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
- 'info_dict': {
- 'id': '188729',
- 'ext': 'flv',
- 'upload_date': '20150204',
- 'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
- 'title': 'Der Bachelor - Folge 4',
- }
- }, {
- 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
- 'only_matching': True,
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_page_url = 'http://%s/' % mobj.group('domain')
- video_id = mobj.group('video_id')
-
- webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
- mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
- if mobj:
- raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage, default=None)
-
- upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
- mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
- duration = int(mobj.group('seconds')) if mobj else None
-
- playerdata_url = self._html_search_regex(
- r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
- playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
- videoinfo = playerdata.find('./playlist/videoinfo')
-
- formats = []
- for filename in videoinfo.findall('filename'):
- mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
- if mobj:
- fmt = {
- 'url': mobj.group('url'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': video_page_url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
- if mobj:
- fmt = {
- 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- fmt = {
- 'url': filename.text,
- }
- fmt.update({
- 'width': int_or_none(filename.get('width')),
- 'height': int_or_none(filename.get('height')),
- 'vbr': int_or_none(filename.get('bitrate')),
- 'ext': 'flv',
- })
- formats.append(fmt)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
index ecf4939cd..82b323cdd 100644
--- a/youtube_dl/extractor/rtp.py
+++ b/youtube_dl/extractor/rtp.py
@@ -18,6 +18,10 @@ class RTPIE(InfoExtractor):
'description': 'As paixões musicais de António Cartaxo e António Macedo',
'thumbnail': 're:^https?://.*\.jpg',
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
'only_matching': True,
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index d0981115d..12639f08b 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -19,7 +19,16 @@ from ..utils import (
class RTSIE(InfoExtractor):
IE_DESC = 'RTS.ch'
- _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))'
+ _VALID_URL = r'''(?x)
+ (?:
+ rts:(?P<rts_id>\d+)|
+ https?://
+ (?:www\.)?rts\.ch/
+ (?:
+ (?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|
+ play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+)
+ )
+ )'''
_TESTS = [
{
@@ -123,6 +132,15 @@ class RTSIE(InfoExtractor):
},
},
{
+ # article with videos on rhs
+ 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html',
+ 'info_dict': {
+ 'id': '6693917',
+ 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse',
+ },
+ 'playlist_mincount': 5,
+ },
+ {
'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280',
'only_matching': True,
}
@@ -130,7 +148,7 @@ class RTSIE(InfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- video_id = m.group('id') or m.group('id_new')
+ video_id = m.group('rts_id') or m.group('id') or m.group('id_new')
display_id = m.group('display_id') or m.group('display_id_new')
def download_json(internal_id):
@@ -143,6 +161,15 @@ class RTSIE(InfoExtractor):
# video_id extracted out of URL is not always a real id
if 'video' not in all_info and 'audio' not in all_info:
page = self._download_webpage(url, display_id)
+
+ # article with videos on rhs
+ videos = re.findall(
+ r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:rts:video:(\d+)"',
+ page)
+ if videos:
+ entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos]
+ return self.playlist_result(entries, video_id, self._og_search_title(page))
+
internal_id = self._html_search_regex(
r'<(?:video|audio) data-id="([0-9]+)"', page,
'internal video id')
@@ -190,6 +217,7 @@ class RTSIE(InfoExtractor):
'tbr': media['rate'] or extract_bitrate(media['url']),
} for media in info['media'] if media.get('rate')])
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 849300140..82cd98ac7 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -17,7 +17,7 @@ from ..utils import (
def _decrypt_url(png):
- encrypted_data = base64.b64decode(png)
+ encrypted_data = base64.b64decode(png.encode('utf-8'))
text_index = encrypted_data.find(b'tEXt')
text_chunk = encrypted_data[text_index - 4:]
length = struct_unpack('!I', text_chunk[:4])[0]
diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py
new file mode 100644
index 000000000..7c9d4b0cd
--- /dev/null
+++ b/youtube_dl/extractor/rtvnh.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class RTVNHIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.rtvnh.nl/video/131946',
+ 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+ 'info_dict': {
+ 'id': '131946',
+ 'ext': 'mp4',
+ 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw',
+ 'thumbnail': 're:^https?:.*\.jpg$'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ meta = self._parse_json(self._download_webpage(
+ 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id)
+
+ status = meta.get('status')
+ if status != 200:
+ raise ExtractorError(
+ '%s returned error code %d' % (self.IE_NAME, status), expected=True)
+
+ formats = self._extract_smil_formats(
+ 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
+
+ for item in meta['source']['fb']:
+ if item.get('type') == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
+ elif item.get('type') == '':
+ formats.append({'url': item['file']})
+
+ return {
+ 'id': video_id,
+ 'title': meta['title'].strip(),
+ 'thumbnail': meta.get('image'),
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 5b1c3577a..d94dc7399 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -30,6 +30,7 @@ class RutubeIE(InfoExtractor):
'uploader': 'NTDRussian',
'uploader_id': '29790',
'upload_date': '20131016',
+ 'age_limit': 0,
},
'params': {
# It requires ffmpeg (m3u8 download)
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index 1ec2c86e5..d9df06861 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -87,7 +87,7 @@ class RUTVIE(InfoExtractor):
'skip': 'Translation has finished',
},
{
- 'url': 'http://live.russia.tv/index/index/channel_id/3',
+ 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
'info_dict': {
'id': '21',
'ext': 'mp4',
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
if mobj:
return mobj.group('url')
@@ -128,8 +128,10 @@ class RUTVIE(InfoExtractor):
elif video_path.startswith('index/iframe/cast_id'):
video_type = 'live'
+ is_live = video_type == 'live'
+
json_data = self._download_json(
- 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
+ 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
@@ -156,6 +158,7 @@ class RUTVIE(InfoExtractor):
for transport, links in media['sources'].items():
for quality, url in links.items():
+ preference = -1 if priority_transport == transport else -2
if transport == 'rtmp':
mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
if not mobj:
@@ -169,9 +172,11 @@ class RUTVIE(InfoExtractor):
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
+ 'preference': preference,
}
elif transport == 'm3u8':
- formats.extend(self._extract_m3u8_formats(url, video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(
+ url, video_id, 'mp4', preference=preference, m3u8_id='hls'))
continue
else:
fmt = {
@@ -181,17 +186,11 @@ class RUTVIE(InfoExtractor):
'width': width,
'height': height,
'format_id': '%s-%s' % (transport, quality),
- 'preference': -1 if priority_transport == transport else -2,
})
formats.append(fmt)
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
-
self._sort_formats(formats)
- is_live = video_type == 'live'
-
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644
index 000000000..4e22628d0
--- /dev/null
+++ b/youtube_dl/extractor/ruutu.py
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+ 'md5': 'ab2093f39be1ca8581963451b3c0234f',
+ 'info_dict': {
+ 'id': '2058907',
+ 'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+ 'ext': 'mp4',
+ 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+ 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 114,
+ 'age_limit': 0,
+ },
+ },
+ {
+ 'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa',
+ 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+ 'info_dict': {
+ 'id': '2057306',
+ 'display_id': 'superpesis-katso-koko-kausi-ruudussa',
+ 'ext': 'mp4',
+ 'title': 'Superpesis: katso koko kausi Ruudussa',
+ 'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 40,
+ 'age_limit': 0,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'data-media-id="(\d+)"', webpage, 'media id')
+
+ video_xml_url = None
+
+ media_data = self._search_regex(
+ r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage,
+ 'media data', default=None)
+ if media_data:
+ media_json = self._parse_json(media_data, display_id, fatal=False)
+ if media_json:
+ xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl')
+ if xml_url:
+ video_xml_url = xml_url.replace('{ID}', video_id)
+
+ if not video_xml_url:
+ video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id
+
+ video_xml = self._download_xml(video_xml_url, video_id)
+
+ formats = []
+ processed_urls = []
+
+ def extract_formats(node):
+ for child in node:
+ if child.tag.endswith('Files'):
+ extract_formats(child)
+ elif child.tag.endswith('File'):
+ video_url = child.text
+ if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+ return
+ processed_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls'))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ else:
+ proto = compat_urllib_parse_urlparse(video_url).scheme
+ if not child.tag.startswith('HTTP') and proto != 'rtmp':
+ continue
+ preference = -1 if proto == 'rtmp' else 1
+ label = child.get('label')
+ tbr = int_or_none(child.get('bitrate'))
+ width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')]
+ formats.append({
+ 'format_id': '%s-%s' % (proto, label if label else tbr),
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'preference': preference,
+ })
+
+ extract_formats(video_xml.find('./Clip'))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+ 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 10251f29e..a602af692 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -20,7 +20,6 @@ from ..utils import (
class SafariBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
_SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
- _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com'
_NETRC_MACHINE = 'safari'
_API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
@@ -37,9 +36,7 @@ class SafariBaseIE(InfoExtractor):
def _login(self):
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError(
- self._ACCOUNT_CREDENTIALS_HINT,
- expected=True)
+ self.raise_login_required('safaribooksonline.com account is required')
headers = std_headers
if 'Referer' not in headers:
@@ -83,7 +80,7 @@ class SafariIE(SafariBaseIE):
library/view/[^/]+|
api/v1/book
)/
- (?P<course_id>\d+)/
+ (?P<course_id>[^/]+)/
(?:chapter(?:-content)?/)?
(?P<part>part\d+)\.html
'''
@@ -100,6 +97,10 @@ class SafariIE(SafariBaseIE):
}, {
'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
'only_matching': True,
+ }, {
+ # non-digits in course id
+ 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -122,7 +123,7 @@ class SafariCourseIE(SafariBaseIE):
IE_NAME = 'safari:course'
IE_DESC = 'safaribooksonline.com online courses'
- _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+ _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index b8775c2f9..d6ee2d9e2 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -1,18 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import json
-import re
from .common import InfoExtractor
-from ..utils import (
- js_to_json,
- remove_end,
-)
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
@@ -22,38 +16,36 @@ class SBSIE(InfoExtractor):
'info_dict': {
'id': '320403011771',
'ext': 'mp4',
- 'title': 'Dingo Conservation',
- 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
+ 'title': 'Dingo Conservation (The Feed)',
+ 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
'thumbnail': 're:http://.*\.jpg',
+ 'duration': 308,
},
- 'add_ies': ['generic'],
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
+ }, {
+ 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
+ video_id = self._match_id(url)
- release_urls_json = js_to_json(self._search_regex(
- r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
- webpage, ''))
- release_urls = json.loads(release_urls_json)
- theplatform_url = (
- release_urls.get('progressive') or release_urls.get('standard'))
+ webpage = self._download_webpage(
+ 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
- title = remove_end(self._og_search_title(webpage), ' (The Feed)')
- description = self._html_search_meta('description', webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ player_params = self._parse_json(
+ self._search_regex(
+ r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
+ video_id)
+
+ urls = player_params['releaseUrls']
+ theplatform_url = (urls.get('progressive') or urls.get('standard') or
+ urls.get('html') or player_params['relatedItemsURL'])
return {
'_type': 'url_transparent',
'id': video_id,
'url': theplatform_url,
-
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index 74fb1983a..05f93904c 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -7,12 +7,13 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ js_to_json,
)
class ScreenwaveMediaIE(InfoExtractor):
- _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
-
+ _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
+ EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1'
_TESTS = [{
'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
'only_matching': True,
@@ -20,58 +21,73 @@ class ScreenwaveMediaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+
+ playerdata = self._download_webpage(
+ 'http://player.screenwavemedia.com/player.php?id=%s' % video_id,
+ video_id, 'Downloading player webpage')
vidtitle = self._search_regex(
r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
- vidurl = self._search_regex(
- r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
-
- videolist_url = None
-
- mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
- if mobj:
- videoserver = mobj.group('videoserver')
- mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
- vidid = mobj.group('vidid') if mobj else video_id
- videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
- else:
- mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
- if mobj:
- videolist_url = mobj.group('smil')
-
- if videolist_url:
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
- formats = []
- baseurl = vidurl[:vidurl.rfind('/') + 1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
+
+ playerconfig = self._download_webpage(
+ 'http://player.screenwavemedia.com/player.js',
+ video_id, 'Downloading playerconfig webpage')
+
+ videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver')
+
+ sources = self._parse_json(
+ js_to_json(
+ re.sub(
+ r'(?s)/\*.*?\*/', '',
+ self._search_regex(
+ r"sources\s*:\s*(\[[^\]]+?\])", playerconfig,
+ 'sources',
+ ).replace(
+ "' + thisObj.options.videoserver + '",
+ videoserver
+ ).replace(
+ "' + playerVidId + '",
+ video_id
+ )
+ )
+ ),
+ video_id, fatal=False
+ )
+
+ # Fall back to hardcoded sources if the player JS changes again
+ if not sources:
+ self.report_warning('Falling back to a hardcoded list of streams')
+ sources = [{
+ 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id),
+ 'type': 'mp4',
+ 'label': format_label,
+ } for format_id, format_label in (
+ ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))]
+ sources.append({
+ 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id),
+ 'type': 'hls',
+ })
+
+ formats = []
+ for source in sources:
+ if source['type'] == 'hls':
+ formats.extend(self._extract_m3u8_formats(source['file'], video_id))
+ else:
+ file_ = source.get('file')
+ if not file_:
continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate,
- 'vcodec': 'none',
- })
- formats.append(format)
- else:
- formats = [{
- 'url': vidurl,
- }]
+ format_label = source.get('label')
+ format_id = self._search_regex(
+ r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_label, 'height', default=None))
+ formats.append({
+ 'url': source['file'],
+ 'format_id': format_id,
+ 'format': format_label,
+ 'ext': source.get('type'),
+ 'height': height,
+ })
self._sort_formats(formats)
return {
@@ -99,7 +115,7 @@ class TeamFourIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
playerdata_url = self._search_regex(
- r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+ r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
webpage, 'player data URL')
video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index d3b8a1be4..9c53704ea 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
["arch", "", "http://ussenate-f.akamaihd.net/"]
]
_IE_NAME = 'senate.gov'
- _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+ _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
'ext': 'mp4',
'title': 'Integrated Senate Video Player'
}
+ }, {
+ # From http://www.c-span.org/video/?96791-1
+ 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+ 'only_matching': True,
}]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
- r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+ r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py
index 6446d26dc..e33483674 100644
--- a/youtube_dl/extractor/sexykarma.py
+++ b/youtube_dl/extractor/sexykarma.py
@@ -29,6 +29,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}, {
'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
new file mode 100644
index 000000000..6e9903d5e
--- /dev/null
+++ b/youtube_dl/extractor/shahid.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class ShahidIE(InfoExtractor):
+ _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?'
+ _TESTS = [{
+ 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',
+ 'info_dict': {
+ 'id': '90574',
+ 'ext': 'm3u8',
+ 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',
+ 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',
+ 'duration': 2972,
+ 'timestamp': 1422057420,
+ 'upload_date': '20150123',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # shahid plus subscriber only
+ 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html',
+ 'only_matching': True
+ }]
+
+ def _handle_error(self, response):
+ if not isinstance(response, dict):
+ return
+ error = response.get('error')
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
+ expected=True)
+
+ def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+ response = super(ShahidIE, self)._download_json(url, video_id, note)['data']
+ self._handle_error(response)
+ return response
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ api_vars = {
+ 'id': video_id,
+ 'type': 'player',
+ 'url': 'http://api.shahid.net/api/v1_1',
+ 'playerType': 'episode',
+ }
+
+ flashvars = self._search_regex(
+ r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None)
+ if flashvars:
+ for key in api_vars.keys():
+ value = self._search_regex(
+ r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key,
+ flashvars, 'type', default=None, group='value')
+ if value:
+ api_vars[key] = value
+
+ player = self._download_json(
+ 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
+ % (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+
+ formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
+
+ video = self._download_json(
+ '%s/%s/%s?%s' % (
+ api_vars['url'], api_vars['playerType'], api_vars['id'],
+ compat_urllib_parse.urlencode({
+ 'apiKey': 'sh@hid0nlin3',
+ 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+ }).encode('utf-8')),
+ video_id, 'Downloading video JSON')
+
+ video = video[api_vars['playerType']]
+
+ title = video['title']
+ description = video.get('description')
+ thumbnail = video.get('thumbnailUrl')
+ duration = int_or_none(video.get('duration'))
+ timestamp = parse_iso8601(video.get('referenceDate'))
+ categories = [
+ category['name']
+ for category in video.get('genres', []) if 'name' in category]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'categories': categories,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index 26ced716e..c5636e8e9 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import base64
from .common import InfoExtractor
@@ -15,17 +14,28 @@ from ..utils import (
class SharedIE(InfoExtractor):
- _VALID_URL = r'http://shared\.sx/(?P<id>[\da-z]{10})'
+ IE_DESC = 'shared.sx and vivo.sx'
+ _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})'
- _TEST = {
+ _TESTS = [{
'url': 'http://shared.sx/0060718775',
'md5': '106fefed92a8a2adb8c98e6a0652f49b',
'info_dict': {
'id': '0060718775',
'ext': 'mp4',
'title': 'Bmp4',
+ 'filesize': 1720110,
},
- }
+ }, {
+ 'url': 'http://vivo.sx/d7ddda0e78',
+ 'md5': '15b3af41be0b4fe01f4df075c2678b2c',
+ 'info_dict': {
+ 'id': 'd7ddda0e78',
+ 'ext': 'mp4',
+ 'title': 'Chicken',
+ 'filesize': 528031,
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,8 +45,7 @@ class SharedIE(InfoExtractor):
raise ExtractorError(
'Video %s does not exist' % video_id, expected=True)
- download_form = dict(re.findall(
- r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
+ download_form = self._hidden_inputs(webpage)
request = compat_urllib_request.Request(
url, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -47,7 +56,7 @@ class SharedIE(InfoExtractor):
video_url = self._html_search_regex(
r'data-url="([^"]+)"', video_page, 'video URL')
title = base64.b64decode(self._html_search_meta(
- 'full:title', webpage, 'title')).decode('utf-8')
+ 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
filesize = int_or_none(self._html_search_meta(
'full:size', webpage, 'file size', fatal=False))
thumbnail = self._html_search_regex(
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 24746a09a..35a81ee87 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -53,7 +53,7 @@ class SmotriIE(InfoExtractor):
'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
},
},
- # video-password
+ # video-password, not approved by moderator
{
'url': 'http://smotri.com/video/view/?id=v1390466a13c',
'md5': 'f6331cef33cad65a0815ee482a54440b',
@@ -71,7 +71,24 @@ class SmotriIE(InfoExtractor):
},
'skip': 'Video is not approved by moderator',
},
- # age limit + video-password
+ # video-password
+ {
+ 'url': 'http://smotri.com/video/view/?id=v6984858774#',
+ 'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
+ 'info_dict': {
+ 'id': 'v6984858774',
+ 'ext': 'mp4',
+ 'title': 'Дача Солженицина ПАРОЛЬ 223322',
+ 'uploader': 'psavari1',
+ 'uploader_id': 'psavari1',
+ 'upload_date': '20081103',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'videopassword': '223322',
+ },
+ },
+ # age limit + video-password, not approved by moderator
{
'url': 'http://smotri.com/video/view/?id=v15408898bcf',
'md5': '91e909c9f0521adf5ee86fbe073aad70',
@@ -90,19 +107,22 @@ class SmotriIE(InfoExtractor):
},
'skip': 'Video is not approved by moderator',
},
- # not approved by moderator, but available
+ # age limit + video-password
{
- 'url': 'http://smotri.com/video/view/?id=v28888533b73',
- 'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+ 'url': 'http://smotri.com/video/view/?id=v7780025814',
+ 'md5': 'b4599b068422559374a59300c5337d72',
'info_dict': {
- 'id': 'v28888533b73',
+ 'id': 'v7780025814',
'ext': 'mp4',
- 'title': 'Russian Spies Killed By ISIL Child Soldier',
- 'uploader': 'Mopeder',
- 'uploader_id': 'mopeder',
- 'duration': 71,
- 'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
- 'upload_date': '20150114',
+ 'title': 'Sexy Beach (пароль 123)',
+ 'uploader': 'вАся',
+ 'uploader_id': 'asya_prosto',
+ 'upload_date': '20081218',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'videopassword': '123'
},
},
# swf player
@@ -152,6 +172,10 @@ class SmotriIE(InfoExtractor):
'getvideoinfo': '1',
}
+ video_password = self._downloader.params.get('videopassword', None)
+ if video_password:
+ video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
+
request = compat_urllib_request.Request(
'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -161,13 +185,18 @@ class SmotriIE(InfoExtractor):
video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
if not video_url:
- if video.get('_moderate_no') or not video.get('moderated'):
+ if video.get('_moderate_no'):
raise ExtractorError(
'Video %s has not been approved by moderator' % video_id, expected=True)
if video.get('error'):
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ if video.get('_pass_protected') == 1:
+ msg = ('Invalid video password' if video_password
+ else 'This video is protected by a password, use the --video-password option')
+ raise ExtractorError(msg, expected=True)
+
title = video['title']
thumbnail = video['_imgURL']
upload_date = unified_strdate(video['added'])
@@ -301,10 +330,7 @@ class SmotriBroadcastIE(InfoExtractor):
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError(
- 'Erotic broadcasts allowed only for registered users, '
- 'use --username and --password options to provide account credentials.',
- expected=True)
+ self.raise_login_required('Erotic broadcasts allowed only for registered users')
login_form = {
'login-hint53': '1',
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
new file mode 100644
index 000000000..6977afb27
--- /dev/null
+++ b/youtube_dl/extractor/snagfilms.py
@@ -0,0 +1,181 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ parse_duration,
+)
+
+
+class SnagFilmsEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
+ _TESTS = [{
+ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+ 'md5': '2924e9215c6eff7a55ed35b72276bd93',
+ 'info_dict': {
+ 'id': '74849a00-85a9-11e1-9660-123139220831',
+ 'ext': 'mp4',
+ 'title': '#whilewewatch',
+ }
+ }, {
+ # invalid labels, 360p is better than 480p
+ 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+ 'md5': '882fca19b9eb27ef865efeeaed376a48',
+ 'info_dict': {
+ 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+ 'ext': 'mp4',
+ 'title': 'Life in Limbo',
+ }
+ }, {
+ 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ if '>This film is not playable in your area.<' in webpage:
+ raise ExtractorError(
+ 'Film %s is not playable in your area.' % video_id, expected=True)
+
+ formats = []
+ for source in self._parse_json(js_to_json(self._search_regex(
+ r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
+ file_ = source.get('file')
+ if not file_:
+ continue
+ type_ = source.get('type')
+ ext = determine_ext(file_)
+ format_id = source.get('label') or ext
+ if all(v == 'm3u8' for v in (type_, ext)):
+ formats.extend(self._extract_m3u8_formats(
+ file_, video_id, 'mp4', m3u8_id='hls'))
+ else:
+ bitrate = int_or_none(self._search_regex(
+ [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
+ file_, 'bitrate', default=None))
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append({
+ 'url': file_,
+ 'format_id': format_id,
+ 'tbr': bitrate,
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
+ webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ }
+
+
+class SnagFilmsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+ 'md5': '19844f897b35af219773fd63bdec2942',
+ 'info_dict': {
+ 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+ 'display_id': 'lost_for_life',
+ 'ext': 'mp4',
+ 'title': 'Lost for Life',
+ 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 4489,
+ 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+ }
+ }, {
+ 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+ 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+ 'info_dict': {
+ 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+ 'display_id': 'the_world_cut_project/india',
+ 'ext': 'mp4',
+ 'title': 'India',
+ 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 979,
+ 'categories': ['Documentary', 'Sports', 'Politics']
+ }
+ }, {
+ # Film is not playable in your area.
+ 'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+ 'only_matching': True,
+ }, {
+ # Film is not available.
+ 'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ if ">Sorry, the Film you're looking for is not available.<" in webpage:
+ raise ExtractorError(
+ 'Film %s is not available.' % display_id, expected=True)
+
+ film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+ snag = self._parse_json(
+ self._search_regex(
+ 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+ display_id)
+
+ for item in snag:
+ if item.get('data', {}).get('film', {}).get('id') == film_id:
+ data = item['data']['film']
+ title = data['title']
+ description = clean_html(data.get('synopsis'))
+ thumbnail = data.get('image')
+ duration = int_or_none(data.get('duration') or data.get('runtime'))
+ categories = [
+ category['title'] for category in data.get('categories', [])
+ if category.get('title')]
+ break
+ else:
+ title = self._search_regex(
+ r'itemprop="title">([^<]+)<', webpage, 'title')
+ description = self._html_search_regex(
+ r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+ webpage, 'description', default=None) or self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = parse_duration(self._search_regex(
+ r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+ webpage, 'duration', fatal=False))
+ categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+ 'id': film_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'categories': categories,
+ }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644
index b5fa6f1da..000000000
--- a/youtube_dl/extractor/sockshare.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- determine_ext,
- ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
- _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
- _TEST = {
- 'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
- 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
- 'info_dict': {
- 'id': '437BE28B89D799D7',
- 'title': 'big_buck_bunny_720p_surround.avi',
- 'ext': 'avi',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://sockshare.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- confirm_hash = self._html_search_regex(r'''(?x)<input\s+
- type="hidden"\s+
- value="([^"]*)"\s+
- name="hash"
- ''', webpage, 'hash')
-
- fields = {
- "hash": confirm_hash.encode('utf-8'),
- "confirm": "Continue as Free User"
- }
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.sockshare.com')
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- webpage = self._download_webpage(
- req, video_id, 'Downloading video page')
-
- video_url = self._html_search_regex(
- r'<a href="([^"]*)".+class="download_file_link"',
- webpage, 'file url')
- video_url = "http://www.sockshare.com" + video_url
- title = self._html_search_regex((
- r'<h1>(.+)<strong>',
- r'var name = "([^"]+)";'),
- webpage, 'title', default=None)
- thumbnail = self._html_search_regex(
- r'<img\s+src="([^"]*)".+?name="bg"',
- webpage, 'thumbnail', default=None)
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': determine_ext(title),
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index f8a4840f7..ba2d5e19b 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -6,9 +6,12 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urllib_request
+ compat_urllib_request,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
)
-from ..utils import sanitize_url_path_consecutive_slashes
class SohuIE(InfoExtractor):
@@ -23,9 +26,7 @@ class SohuIE(InfoExtractor):
'ext': 'mp4',
'title': 'MV:Far East Movement《The Illest》',
},
- 'params': {
- 'cn_verification_proxy': 'proxy.uku.im:8888'
- }
+ 'skip': 'On available in China',
}, {
'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
'md5': '699060e75cf58858dd47fb9c03c42cfb',
@@ -117,6 +118,15 @@ class SohuIE(InfoExtractor):
r'var vid ?= ?["\'](\d+)["\']',
webpage, 'video path')
vid_data = _fetch_data(vid, mytv)
+ if vid_data['play'] != 1:
+ if vid_data.get('status') == 12:
+ raise ExtractorError(
+ 'Sohu said: There\'s something wrong in the video.',
+ expected=True)
+ else:
+ raise ExtractorError(
+ 'Sohu said: The video is only licensed to users in Mainland China.',
+ expected=True)
formats_json = {}
for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
@@ -133,23 +143,41 @@ class SohuIE(InfoExtractor):
formats = []
for format_id, format_data in formats_json.items():
allot = format_data['allot']
- prot = format_data['prot']
data = format_data['data']
clips_url = data['clipsURL']
su = data['su']
- part_str = self._download_webpage(
- 'http://%s/?prot=%s&file=%s&new=%s' %
- (allot, prot, clips_url[i], su[i]),
- video_id,
- 'Downloading %s video URL part %d of %d'
- % (format_id, i + 1, part_count))
+ video_url = 'newflv.sohu.ccgslb.net'
+ cdnId = None
+ retries = 0
+
+ while 'newflv.sohu.ccgslb.net' in video_url:
+ params = {
+ 'prot': 9,
+ 'file': clips_url[i],
+ 'new': su[i],
+ 'prod': 'flash',
+ }
+
+ if cdnId is not None:
+ params['idc'] = cdnId
+
+ download_note = 'Downloading %s video URL part %d of %d' % (
+ format_id, i + 1, part_count)
+
+ if retries > 0:
+ download_note += ' (retry #%d)' % retries
+ part_info = self._parse_json(self._download_webpage(
+ 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+ video_id, download_note), video_id)
- part_info = part_str.split('|')
+ video_url = part_info['url']
+ cdnId = part_info.get('nid')
- video_url = sanitize_url_path_consecutive_slashes(
- '%s%s?key=%s' % (part_info[0], su[i], part_info[3]))
+ retries += 1
+ if retries > 5:
+ raise ExtractorError('Failed to get video URL')
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644
index 000000000..5da66ca9e
--- /dev/null
+++ b/youtube_dl/extractor/soompi.py
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ remove_start,
+ xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    """Shared helper for the Soompi TV extractors."""
+
+    def _get_episodes(self, webpage, episode_filter=None):
+        """Return the episodes found in the page's ``VIDEOS = [...]`` JSON array.
+
+        ``episode_filter`` is handed straight to ``filter()``: a predicate
+        keeps only matching episodes, ``None`` keeps every truthy entry.
+        """
+        videos_json = self._search_regex(
+            r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON')
+        all_episodes = self._parse_json(videos_json, None)
+        return list(filter(episode_filter, all_episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    """Extract a single episode from tv.soompi.com.
+
+    Formats come from one config XML document per advertised quality.
+    Subtitles are decrypted and parsed with ``_decrypt_subtitles`` and
+    ``_extract_subtitles``, which are not defined in this class
+    (presumably inherited from CrunchyrollIE -- confirm).
+    """
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        """Return the first listed episode whose ``id`` equals ``video_id``.
+
+        Raises IndexError when no episode on the page matches.
+        """
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+    def _get_subtitles(self, config, video_id):
+        """Decrypt the subtitles preloaded in the ``config`` XML element.
+
+        Returns a dict keyed by the ``title`` attribute of each declared
+        subtitle track, with the result of ``_extract_subtitles`` as value.
+        Entries whose id is not declared, or that lack ``iv`` or ``data``,
+        are skipped.
+        """
+        # Map subtitle id -> title for every track declared in the config.
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            # Check sub_id rather than the builtin id(), which is always
+            # truthy and made this part of the guard a no-op.
+            if not sub_id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        """Download the episode page and build the info dict for it.
+
+        Raises an expected ExtractorError carrying the site's own message
+        when the page answers HTTP 403 with a ``block-message`` element;
+        any other download error is re-raised unchanged.
+        """
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                # The error body is raw bytes; decode it so the str regex
+                # below can be applied on Python 3 as well.
+                webpage = ee.cause.read().decode('utf-8', 'replace')
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        # One config XML per quality id advertised on the page.
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        # ``config`` is the last config XML downloaded in the loop above.
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    """Playlist extractor for show pages on tv.soompi.com."""
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist with one URL result per episode on the show page."""
+        show_id = self._match_id(url)
+        page = self._download_webpage(url, show_id, 'Downloading show page')
+
+        show_title = remove_start(self._og_search_title(page), 'SoompiTV | ')
+        show_description = self._og_search_description(page)
+
+        # Each episode is delegated to the 'Soompi' extractor by watch URL.
+        entries = []
+        for ep in self._get_episodes(page):
+            watch_url = 'http://tv.soompi.com/en/watch/%s' % ep['id']
+            entries.append(self.url_result(watch_url, 'Soompi'))
+
+        return self.playlist_result(
+            entries, show_id, show_title, show_description)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 183ff50f4..ed5dcc0d3 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?P<uploader>[\w\d-]+)/
- (?!sets/|likes/?(?:$|[?#]))
+ (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -282,66 +282,150 @@ class SoundcloudSetIE(SoundcloudIE):
msgs = (compat_str(err['error_message']) for err in info['errors'])
raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
+ entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']]
+
return {
'_type': 'playlist',
- 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']],
+ 'entries': entries,
'id': '%s' % info['id'],
'title': info['title'],
}
class SoundcloudUserIE(SoundcloudIE):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:(?:www|m)\.)?soundcloud\.com/
+ (?P<user>[^/]+)
+ (?:/
+ (?P<rsrc>tracks|sets|reposts|likes|spotlight)
+ )?
+ /?(?:[?#].*)?$
+ '''
IE_NAME = 'soundcloud:user'
_TESTS = [{
- 'url': 'https://soundcloud.com/the-concept-band',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (All)',
+ },
+ 'playlist_mincount': 111,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Tracks)',
+ },
+ 'playlist_mincount': 50,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Playlists)',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Reposts)',
},
- 'playlist_mincount': 12
+ 'playlist_mincount': 7,
}, {
- 'url': 'https://soundcloud.com/the-concept-band/likes',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Likes)',
+ },
+ 'playlist_mincount': 321,
+ }, {
+ 'url': 'https://soundcloud.com/grynpyret/spotlight',
+ 'info_dict': {
+ 'id': '7098329',
+ 'title': 'Grynpyret (Spotlight)',
},
'playlist_mincount': 1,
}]
+ _API_BASE = 'https://api.soundcloud.com'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ _BASE_URL_MAP = {
+ 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
+ 'tracks': '%s/users/%%s/tracks' % _API_BASE,
+ 'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
+ 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
+ 'likes': '%s/users/%%s/likes' % _API_V2_BASE,
+ 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+ }
+
+ _TITLE_MAP = {
+ 'all': 'All',
+ 'tracks': 'Tracks',
+ 'sets': 'Playlists',
+ 'reposts': 'Reposts',
+ 'likes': 'Likes',
+ 'spotlight': 'Spotlight',
+ }
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user')
- resource = mobj.group('rsrc')
- if resource is None:
- resource = 'tracks'
- elif resource == 'likes':
- resource = 'favorites'
url = 'http://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url)
user = self._download_json(
resolv_url, uploader, 'Downloading user info')
- base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)
+
+ resource = mobj.group('rsrc') or 'all'
+ base_url = self._BASE_URL_MAP[resource] % user['id']
+
+ next_href = None
entries = []
for i in itertools.count():
- data = compat_urllib_parse.urlencode({
- 'offset': i * 50,
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- })
- new_entries = self._download_json(
- base_url + data, uploader, 'Downloading track page %s' % (i + 1))
- if len(new_entries) == 0:
+ if not next_href:
+ data = compat_urllib_parse.urlencode({
+ 'offset': i * 50,
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ 'representation': 'speedy',
+ })
+ next_href = base_url + '?' + data
+
+ response = self._download_json(
+ next_href, uploader, 'Downloading track page %s' % (i + 1))
+
+ collection = response['collection']
+
+ if not collection:
self.to_screen('%s: End page received' % uploader)
break
- entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+
+ def resolve_permalink_url(candidates):
+ for cand in candidates:
+ if isinstance(cand, dict):
+ permalink_url = cand.get('permalink_url')
+ if permalink_url and permalink_url.startswith('http'):
+ return permalink_url
+
+ for e in collection:
+ permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+ if permalink_url:
+ entries.append(self.url_result(permalink_url))
+
+ if 'next_href' in response:
+ next_href = response['next_href']
+ if not next_href:
+ break
+ else:
+ next_href = None
return {
'_type': 'playlist',
'id': compat_str(user['id']),
- 'title': user['username'],
+ 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
'entries': entries,
}
@@ -376,9 +460,7 @@ class SoundcloudPlaylistIE(SoundcloudIE):
data = self._download_json(
base_url + data, playlist_id, 'Downloading playlist')
- entries = [
- self._extract_info_dict(t, quiet=True, secret_token=token)
- for t in data['tracks']]
+ entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']]
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index e3b73295c..87b650468 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -32,7 +32,7 @@ class SouthParkEsIE(SouthParkIE):
}]
-class SouthparkDeIE(SouthParkIE):
+class SouthParkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
_VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
@@ -45,4 +45,34 @@ class SouthparkDeIE(SouthParkIE):
'title': 'The Government Won\'t Respect My Privacy',
'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
},
+ }, {
+ # non-ASCII characters in initial URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+ 'playlist_count': 4,
+ }, {
+ # non-ASCII characters in redirect URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09',
+ 'playlist_count': 4,
+ }]
+
+
+# southpark.nl variant of SouthParkIE: only the extractor name, the URL
+# pattern and the feed URL are overridden here.
+class SouthParkNlIE(SouthParkIE):
+ IE_NAME = 'southpark.nl'
+ # Matches /clips/... and /full-episodes/... pages; the 'url' group holds
+ # the address without its scheme and optional 'www.' prefix.
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'
+
+ # The test episode is expected to resolve to a playlist of 4 entries.
+ _TESTS = [{
+ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
+ 'playlist_count': 4,
+ }]
+
+
+# southparkstudios.dk variant of SouthParkIE: only the extractor name, the
+# URL pattern and the feed URL are overridden here.
+class SouthParkDkIE(SouthParkIE):
+ IE_NAME = 'southparkstudios.dk'
+ # Matches /clips/... and /full-episodes/... pages; the 'url' group holds
+ # the address without its scheme and optional 'www.' prefix.
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'
+
+ # The test episode is expected to resolve to a playlist of 4 entries.
+ _TESTS = [{
+ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
+ 'playlist_count': 4,
}]
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index b936202f6..9e8fb35b2 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -4,7 +4,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
compat_urllib_request,
)
@@ -16,8 +16,9 @@ from ..aes import aes_decrypt_text
class SpankwireIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)'
+ _TESTS = [{
+ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
'md5': '8bbfde12b101204b39e4b9fe7eb67095',
'info_dict': {
@@ -27,24 +28,37 @@ class SpankwireIE(InfoExtractor):
'description': 'Crazy Bitch X rated music video.',
'uploader': 'oreusz',
'uploader_id': '124697',
- 'upload_date': '20070508',
+ 'upload_date': '20070507',
'age_limit': 18,
}
- }
+ }, {
+ # download URL pattern: */mp4_<format_id>_<video_id>.mp4
+ 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/',
+ 'md5': '09b3c20833308b736ae8902db2f8d7e6',
+ 'info_dict': {
+ 'id': '1921551',
+ 'ext': 'mp4',
+ 'title': 'Titcums Compiloation I',
+ 'description': 'cum on tits',
+ 'uploader': 'dannyh78999',
+ 'uploader_id': '3056053',
+ 'upload_date': '20150822',
+ 'age_limit': 18,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = mobj.group('id')
- req = compat_urllib_request.Request(url)
+ req = compat_urllib_request.Request('http://www.' + mobj.group('url'))
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
title = self._html_search_regex(
r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
- r'<div\s+id="descriptionContent">([^<]+)<',
+ r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -54,7 +68,7 @@ class SpankwireIE(InfoExtractor):
r'by:\s*<a [^>]*>(.+?)</a>',
webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex(
- r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"',
+ r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"',
webpage, 'uploader id', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
r'</a> on (.+?) at \d+:\d+',
@@ -64,14 +78,15 @@ class SpankwireIE(InfoExtractor):
r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
- r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+ r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
webpage, 'comment count', fatal=False))
- video_urls = list(map(
- compat_urllib_parse.unquote,
- re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+ videos = re.findall(
+ r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)
+ heights = [int(video[0]) for video in videos]
+ video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos]))
if webpage.find('flashvars\.encrypted = "true"') != -1:
- password = self._html_search_regex(
+ password = self._search_regex(
r'flashvars\.video_title = "([^"]+)',
webpage, 'password').replace('+', ' ')
video_urls = list(map(
@@ -79,21 +94,22 @@ class SpankwireIE(InfoExtractor):
video_urls))
formats = []
- for video_url in video_urls:
+ for height, video_url in zip(heights, video_urls):
path = compat_urllib_parse_urlparse(video_url).path
- format = path.split('/')[4].split('_')[:2]
- resolution, bitrate_str = format
- format = "-".join(format)
- height = int(resolution.rstrip('Pp'))
- tbr = int(bitrate_str.rstrip('Kk'))
- formats.append({
+ _, quality = path.split('/')[4].split('_')[:2]
+ f = {
'url': video_url,
- 'resolution': resolution,
- 'format': format,
- 'tbr': tbr,
'height': height,
- 'format_id': format,
- })
+ }
+ tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None)
+ if tbr:
+ f.update({
+ 'tbr': int(tbr),
+ 'format_id': '%dp' % height,
+ })
+ else:
+ f['format_id'] = quality
+ formats.append(f)
self._sort_formats(formats)
age_limit = self._rta_search(webpage)
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index b868241d5..5bd3c0087 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$'
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
'md5': '2c2754212136f35fb4b19767d242f66e',
@@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor):
'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
}
+ }, {
+ 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89..27f4033c5 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ float_or_none,
+)
class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg$',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
}
}, {
@@ -51,9 +55,39 @@ class SpiegeltvIE(InfoExtractor):
is_wide = media_json['is_wide']
server_json = self._download_json(
- 'http://www.spiegel.tv/streaming_servers/', video_id,
- note='Downloading server information')
- server = server_json[0]['endpoint']
+ 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+ video_id, note='Downloading server information')
+
+ format = '16x9' if is_wide else '4x3'
+
+ formats = []
+ for streamingserver in server_json['streamingserver']:
+ endpoint = streamingserver.get('endpoint')
+ if not endpoint:
+ continue
+ play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+ if endpoint.startswith('rtmp'):
+ formats.append({
+ 'url': endpoint,
+ 'format_id': 'rtmp',
+ 'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+ 'play_path': play_path,
+ 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+ 'ext': 'flv',
+ 'rtmp_live': True,
+ })
+ elif determine_ext(endpoint) == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ endpoint.replace('[video]', play_path),
+ video_id, 'm4v',
+ preference=1, # Prefer hls since it allows to workaround georestriction
+ m3u8_id='hls', fatal=False)
+ if m3u8_formats is not False:
+ formats.extend(m3u8_formats)
+ else:
+ formats.append({
+ 'url': endpoint,
+ })
thumbnails = []
for image in media_json['images']:
@@ -65,16 +99,12 @@ class SpiegeltvIE(InfoExtractor):
description = media_json['subtitle']
duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
- format = '16x9' if is_wide else '4x3'
-
- url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
return {
'id': video_id,
'title': title,
- 'url': url,
- 'ext': 'm4v',
'description': description,
'duration': duration,
- 'thumbnails': thumbnails
+ 'thumbnails': thumbnails,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index becdf658f..86d509ae5 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -4,37 +4,36 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
- parse_duration,
- parse_iso8601,
+ unified_strdate,
)
class SportBoxIE(InfoExtractor):
- _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
- _TESTS = [
- {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
- 'md5': 'ff56a598c2cf411a9a38a69709e97079',
- 'info_dict': {
- 'id': '80822',
- 'ext': 'mp4',
- 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
- 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'timestamp': 1411896237,
- 'upload_date': '20140928',
- 'duration': 4846,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+ 'md5': 'ff56a598c2cf411a9a38a69709e97079',
+ 'info_dict': {
+ 'id': '80822',
+ 'ext': 'mp4',
+ 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+ 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140928',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+ player = self._search_regex(
+ r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
+
+ title = self._html_search_regex(
+ [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+ webpage, 'title')
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'dateCreated', webpage, 'upload date'))
+
+ return {
+ '_type': 'url_transparent',
+ 'url': compat_urlparse.urljoin(url, '/%s' % player),
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
- player = self._download_webpage(
- 'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
- display_id, 'Downloading player webpage')
+
+class SportBoxEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+ 'info_dict': {
+ 'id': '211355',
+ 'ext': 'mp4',
+ 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
+ webpage)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
hls = self._search_regex(
- r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+ r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
+ webpage, 'hls file')
- formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+ formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
- title = self._html_search_regex(
- r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
- description = self._html_search_regex(
- r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
- timestamp = parse_iso8601(self._search_regex(
- r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
- duration = parse_duration(self._html_search_regex(
- r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+ title = self._search_regex(
+ r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+
+ thumbnail = self._search_regex(
+ r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
+ webpage, 'thumbnail', default=None)
return {
'id': video_id,
- 'display_id': display_id,
'title': title,
- 'description': description,
'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 1a57aebf1..7ec6c613f 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -38,10 +38,12 @@ class SportDeutschlandIE(InfoExtractor):
'upload_date': '20140825',
'description': 'md5:60a20536b57cee7d9a4ec005e8687504',
'timestamp': 1408976060,
+ 'duration': 2732,
'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee',
'thumbnail': 're:^https?://.*\.jpg$',
'view_count': int,
'categories': ['Li-Ning Badminton WM 2014'],
+
}
}]
@@ -50,7 +52,7 @@ class SportDeutschlandIE(InfoExtractor):
video_id = mobj.group('id')
sport_id = mobj.group('sport')
- api_url = 'http://splink.tv/api/permalinks/%s/%s' % (
+ api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
sport_id, video_id)
req = compat_urllib_request.Request(api_url, headers={
'Accept': 'application/vnd.vidibus.v2.html+json',
@@ -58,12 +60,11 @@ class SportDeutschlandIE(InfoExtractor):
})
data = self._download_json(req, video_id)
- categories = list(data.get('section', {}).get('tags', {}).values())
asset = data['asset']
- assets_info = self._download_json(asset['url'], video_id)
+ categories = [data['section']['title']]
formats = []
- smil_url = assets_info['video']
+ smil_url = asset['video']
if '.smil' in smil_url:
m3u8_url = smil_url.replace('.smil', '.m3u8')
formats.extend(
@@ -91,6 +92,7 @@ class SportDeutschlandIE(InfoExtractor):
'title': asset['title'],
'thumbnail': asset.get('image'),
'description': asset.get('teaser'),
+ 'duration': asset.get('duration'),
'categories': categories,
'view_count': asset.get('views'),
'rtmp_live': asset.get('live'),
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 854d01bee..e527aa971 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'class="views">\s*(\d+)\s*<',
+ r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
r'(\d+)</b> Comments?',
diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svt.py
index 433dfd1cb..fc20f664b 100644
--- a/youtube_dl/extractor/svtplay.py
+++ b/youtube_dl/extractor/svt.py
@@ -9,41 +9,9 @@ from ..utils import (
)
-class SVTPlayIE(InfoExtractor):
- IE_DESC = 'SVT Play and Öppet arkiv'
- _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
- _TESTS = [{
- 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
- 'md5': 'ade3def0643fa1c40587a422f98edfd9',
- 'info_dict': {
- 'id': '2609989',
- 'ext': 'flv',
- 'title': 'SM veckan vinter, Örebro - Rally, final',
- 'duration': 4500,
- 'thumbnail': 're:^https?://.*[\.-]jpg$',
- 'age_limit': 0,
- },
- }, {
- 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
- 'md5': 'c3101a17ce9634f4c1f9800f0746c187',
- 'info_dict': {
- 'id': '1058509',
- 'ext': 'flv',
- 'title': 'Farlig kryssning',
- 'duration': 2566,
- 'thumbnail': 're:^https?://.*[\.-]jpg$',
- 'age_limit': 0,
- },
- 'skip': 'Only works from Sweden',
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- host = mobj.group('host')
-
- info = self._download_json(
- 'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id)
+class SVTBaseIE(InfoExtractor):
+ def _extract_video(self, url, video_id):
+ info = self._download_json(url, video_id)
title = info['context']['title']
thumbnail = info['context'].get('thumbnailImage')
@@ -80,3 +48,70 @@ class SVTPlayIE(InfoExtractor):
'duration': duration,
'age_limit': age_limit,
}
+
+
+class SVTIE(SVTBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
+ 'md5': '9648197555fc1b49e3dc22db4af51d46',
+ 'info_dict': {
+ 'id': '2900353',
+ 'ext': 'flv',
+ 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+ 'duration': 27,
+ 'age_limit': 0,
+ },
+ }
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ widget_id = mobj.group('widget_id')
+ article_id = mobj.group('id')
+ return self._extract_video(
+ 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
+ article_id)
+
+
+class SVTPlayIE(SVTBaseIE):
+ IE_DESC = 'SVT Play and Öppet arkiv'
+ _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
+ 'md5': 'ade3def0643fa1c40587a422f98edfd9',
+ 'info_dict': {
+ 'id': '2609989',
+ 'ext': 'flv',
+ 'title': 'SM veckan vinter, Örebro - Rally, final',
+ 'duration': 4500,
+ 'thumbnail': 're:^https?://.*[\.-]jpg$',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
+ 'md5': 'c3101a17ce9634f4c1f9800f0746c187',
+ 'info_dict': {
+ 'id': '1058509',
+ 'ext': 'flv',
+ 'title': 'Farlig kryssning',
+ 'duration': 2566,
+ 'thumbnail': 're:^https?://.*[\.-]jpg$',
+ 'age_limit': 0,
+ },
+ 'skip': 'Only works from Sweden',
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
+ return self._extract_video(
+ 'http://www.%s.se/video/%s?output=json' % (host, video_id),
+ video_id)
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index bfe07b024..73e7657d4 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -8,17 +8,17 @@ from ..utils import parse_filesize
class TagesschauIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
_TESTS = [{
- 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
- 'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
+ 'md5': '917a228bc7df7850783bc47979673a09',
'info_dict': {
- 'id': '1399128',
+ 'id': '102143',
'ext': 'mp4',
- 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
- 'description': 'md5:69da3c61275b426426d711bde96463ab',
- 'thumbnail': 're:^http:.*\.jpg$',
+ 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+ 'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+ 'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
@@ -28,8 +28,39 @@ class TagesschauIE(InfoExtractor):
'ext': 'mp4',
'description': 'md5:695c01bfd98b7e313c501386327aea59',
'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
- 'thumbnail': 're:^http:.*\.jpg$',
- }
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
+ 'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+ 'info_dict': {
+ 'id': '18407',
+ 'ext': 'mp3',
+ 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+ 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
+ 'only_matching': True,
}]
_FORMATS = {
@@ -49,19 +80,26 @@ class TagesschauIE(InfoExtractor):
playerpage = self._download_webpage(
player_url, display_id, 'Downloading player page')
- medias = re.findall(
- r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
- playerpage)
formats = []
- for url, ext, res in medias:
+ for media in re.finditer(
+ r'''(?x)
+ (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
+ ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
+ (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
+ ''', playerpage):
+ url = media.group('url')
+ type_ = media.group('type')
+ ext = media.group('ext')
+ res = media.group('quality')
f = {
- 'format_id': res + '_' + ext,
+ 'format_id': '%s_%s' % (res, ext) if res else ext,
'url': url,
'ext': ext,
+ 'vcodec': 'none' if type_ == 'audio' else None,
}
f.update(self._FORMATS.get(res, {}))
formats.append(f)
- thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ thumbnail = self._og_search_thumbnail(playerpage)
title = self._og_search_title(webpage).strip()
description = self._og_search_description(webpage).strip()
else:
@@ -99,17 +137,14 @@ class TagesschauIE(InfoExtractor):
'filesize_approx': parse_filesize(m.group('filesize_approx')),
})
formats.append(format)
- thumbnail_fn = self._search_regex(
- r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
description = self._html_search_regex(
r'(?s)<p class="teasertext">(.*?)</p>',
- webpage, 'description', fatal=False)
+ webpage, 'description', default=None)
title = self._html_search_regex(
r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
self._sort_formats(formats)
- thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
return {
'id': display_id,
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 2381676b4..d1b7264b4 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -2,13 +2,17 @@
from __future__ import unicode_literals
import base64
+import binascii
import re
+import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
qualities,
+ determine_ext,
)
+from ..compat import compat_ord
class TeamcocoIE(InfoExtractor):
@@ -47,6 +51,17 @@ class TeamcocoIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 downloads
}
+ }, {
+ 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+ 'info_dict': {
+ 'id': '89341',
+ 'ext': 'mp4',
+ 'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
}
]
_VIDEO_ID_REGEXES = (
@@ -59,40 +74,70 @@ class TeamcocoIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id)
+ webpage, urlh = self._download_webpage_handle(url, display_id)
+ if 'src=expired' in urlh.geturl():
+ raise ExtractorError('This video is expired.', expected=True)
video_id = mobj.group('video_id')
if not video_id:
video_id = self._html_search_regex(
self._VIDEO_ID_REGEXES, webpage, 'video id')
- preload = None
- preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage)
- if preloads:
- preload = max([(len(p), p) for p in preloads])[1]
-
- if not preload:
- preload = ''.join(re.findall(r'this\.push\("([^"]+)"\);', webpage))
-
- if not preload:
- preload = self._html_search_regex([
- r'player,\[?"([^"]+)"\]?', r'player.init\(\[?"([^"]+)"\]?\)'
- ], webpage.replace('","', ''), 'preload data', default=None)
-
- if not preload:
+ data = None
+
+ preload_codes = self._html_search_regex(
+ r'(function.+)setTimeout\(function\(\)\{playlist',
+ webpage, 'preload codes')
+ base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+ base64_fragments.remove('init')
+
+ def _check_sequence(cur_fragments):
+ if not cur_fragments:
+ return
+ for i in range(len(cur_fragments)):
+ cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii')
+ try:
+ raw_data = base64.b64decode(cur_sequence)
+ if compat_ord(raw_data[0]) == compat_ord('{'):
+ return json.loads(raw_data.decode('utf-8'))
+ except (TypeError, binascii.Error, UnicodeDecodeError, ValueError):
+ continue
+
+ def _check_data():
+ for i in range(len(base64_fragments) + 1):
+ for j in range(i, len(base64_fragments) + 1):
+ data = _check_sequence(base64_fragments[:i] + base64_fragments[j:])
+ if data:
+ return data
+
+ self.to_screen('Try to compute possible data sequence. This may take some time.')
+ data = _check_data()
+
+ if not data:
raise ExtractorError(
'Preload information could not be extracted', expected=True)
- data = self._parse_json(
- base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id)
-
formats = []
get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
for filed in data['files']:
- if filed['type'] == 'hls':
- formats.extend(self._extract_m3u8_formats(
- filed['url'], video_id, ext='mp4'))
+ if determine_ext(filed['url']) == 'm3u8':
+ # compat_urllib_parse.urljoin does not work here
+ if filed['url'].startswith('/'):
+ m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+ else:
+ m3u8_url = filed['url']
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4')
+ for m3u8_format in m3u8_formats:
+ if m3u8_format not in formats:
+ formats.append(m3u8_format)
+ elif determine_ext(filed['url']) == 'f4m':
+ # TODO Correct f4m extraction
+ continue
else:
+ if filed['url'].startswith('/mp4:protected/'):
+ # TODO Correct extraction for these files
+ continue
m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
if m_format is not None:
format_id = m_format.group(1)
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 251a68680..ae94f055c 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -6,7 +6,7 @@ from .mitele import MiTeleIE
class TelecincoIE(MiTeleIE):
IE_NAME = 'telecinco.es'
- _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/(?:[^/]+/)?(?P<id>.*?)\.html'
+ _VALID_URL = r'https?://www\.telecinco\.es/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
@@ -16,7 +16,14 @@ class TelecincoIE(MiTeleIE):
'title': 'Con Martín Berasategui, hacer un bacalao al ...',
'duration': 662,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
+ 'only_matching': True,
}]
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
new file mode 100644
index 000000000..6f8333cfc
--- /dev/null
+++ b/youtube_dl/extractor/telegraaf.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class TelegraafIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
+ _TEST = {
+ 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
+ 'md5': '83245a9779bcc4a24454bfd53c65b6dc',
+ 'info_dict': {
+ 'id': '24353229',
+ 'ext': 'mp4',
+ 'title': 'Tikibad ontruimd wegens brand',
+ 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 33,
+ },
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist_url = self._search_regex(
+ r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+
+ entries = self._extract_xspf_playlist(playlist_url, playlist_id)
+ title = remove_end(self._og_search_title(webpage), ' - VIDEO')
+ description = self._og_search_description(webpage)
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index 466155ef8..f6694149b 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -2,6 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ float_or_none,
+)
class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
if protocol == 'rtmp':
url = url.replace('&mp4:', '')
+ tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
formats.append({
- 'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
- 'width': rendition['frameWidth'],
- 'height': rendition['frameHeight'],
- 'tbr': rendition['encodingRate'] / 1024,
- 'filesize': rendition['size'],
+ 'format_id': '_'.join(
+ ['rtmp', rendition['videoContainer'].lower(),
+ rendition['videoCodec'].lower(), '%sk' % tbr]),
+ 'width': int_or_none(rendition['frameWidth']),
+ 'height': int_or_none(rendition['frameHeight']),
+ 'tbr': tbr,
+ 'filesize': int_or_none(rendition['size']),
'protocol': protocol,
'ext': ext,
'vcodec': rendition['videoCodec'].lower(),
'container': rendition['videoContainer'].lower(),
'url': url,
})
+ self._sort_formats(formats)
return {
'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
'url': json['thumbnailURL']
}],
'thumbnail': json['videoStillURL'],
- 'duration': json['length'] / 1000,
- 'timestamp': float(json['creationDate']) / 1000,
- 'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
- 'view_count': json['playsTotal']
+ 'duration': float_or_none(json.get('length'), 1000),
+ 'timestamp': float_or_none(json.get('creationDate'), 1000),
+ 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+ 'view_count': int_or_none(json.get('playsTotal')),
}
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 025d0877c..3a68eaa80 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
- _TESTS = {
+ _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 92731ad3d..25edc3100 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,7 +1,7 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
-import json
import time
import hmac
import binascii
@@ -10,7 +10,8 @@ import hashlib
from .common import InfoExtractor
from ..compat import (
- compat_str,
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
@@ -18,15 +19,72 @@ from ..utils import (
xpath_with_ns,
unsmuggle_url,
int_or_none,
+ url_basename,
+ float_or_none,
)
-_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
+default_ns = 'http://www.w3.org/2005/SMIL21/Language'
+_x = lambda p: xpath_with_ns(p, {'smil': default_ns})
-class ThePlatformIE(InfoExtractor):
+class ThePlatformBaseIE(InfoExtractor):
+ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
+ meta = self._download_xml(smil_url, video_id, note=note)
+ try:
+ error_msg = next(
+ n.attrib['abstract']
+ for n in meta.findall(_x('.//smil:ref'))
+ if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
+ except StopIteration:
+ pass
+ else:
+ raise ExtractorError(error_msg, expected=True)
+
+ formats = self._parse_smil_formats(
+ meta, smil_url, video_id, namespace=default_ns,
+ # the parameters are from syfy.com, other sites may use others,
+ # they also work for nbc.com
+ f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
+ transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))
+
+ for _format in formats:
+ ext = determine_ext(_format['url'])
+ if ext == 'once':
+ _format['ext'] = 'mp4'
+
+ self._sort_formats(formats)
+
+ subtitles = self._parse_smil_subtitles(meta, default_ns)
+
+ return formats, subtitles
+
+ def get_metadata(self, path, video_id):
+ info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
+ info = self._download_json(info_url, video_id)
+
+ subtitles = {}
+ captions = info.get('captions')
+ if isinstance(captions, list):
+ for caption in captions:
+ lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
+ subtitles[lang] = [{
+ 'ext': 'srt' if mime == 'text/srt' else 'ttml',
+ 'url': src,
+ }]
+
+ return {
+ 'title': info['title'],
+ 'subtitles': subtitles,
+ 'description': info['description'],
+ 'thumbnail': info['defaultThumbnailUrl'],
+ 'duration': int_or_none(info.get('duration'), 1000),
+ }
+
+
+class ThePlatformIE(ThePlatformBaseIE):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
- (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+ (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
_TESTS = [{
@@ -56,6 +114,31 @@ class ThePlatformIE(InfoExtractor):
# rtmp download
'skip_download': True,
}
+ }, {
+ 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+ 'info_dict': {
+ 'id': 'yMBg9E8KFxZD',
+ 'ext': 'mp4',
+ 'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+ 'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+ }
+ }, {
+ 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
+ 'md5': '734f3790fb5fc4903da391beeebc4836',
+ 'info_dict': {
+ 'id': 'tdy_or_siri_150701',
+ 'ext': 'mp4',
+ 'title': 'iPhone Siri’s sassy response to a math question has people talking',
+ 'description': 'md5:a565d1deadd5086f3331d57298ec6333',
+ 'duration': 83.0,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1435752600,
+ 'upload_date': '20150701',
+ 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
+ },
}]
@staticmethod
@@ -85,6 +168,29 @@ class ThePlatformIE(InfoExtractor):
if not provider_id:
provider_id = 'dJ5BDC'
+ path = provider_id
+ if mobj.group('media'):
+ path += '/media'
+ path += '/' + video_id
+
+ qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ if 'guid' in qs_dict:
+ webpage = self._download_webpage(url, video_id)
+ scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
+ feed_id = None
+ # The feed id is usually located in the last script.
+ # There seems to be no pattern for the relevant script's filename, so
+ # try the scripts one by one, starting from the last
+ for script in reversed(scripts):
+ feed_script = self._download_webpage(script, video_id, 'Downloading feed script')
+ feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None)
+ if feed_id is not None:
+ break
+ if feed_id is None:
+ raise ExtractorError('Unable to find feed id')
+ return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
+ provider_id, feed_id, qs_dict['guid'][0]))
+
if smuggled_data.get('force_smil_url', False):
smil_url = url
elif mobj.group('config'):
@@ -92,104 +198,97 @@ class ThePlatformIE(InfoExtractor):
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')
config = self._download_json(config_url, video_id, 'Downloading config')
- smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
+ if 'releaseUrl' in config:
+ release_url = config['releaseUrl']
+ else:
+ release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+ smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
- smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?'
- 'format=smil&mbr=true'.format(provider_id, video_id))
+ smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
sig = smuggled_data.get('sig')
if sig:
smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
- meta = self._download_xml(smil_url, video_id)
- try:
- error_msg = next(
- n.attrib['abstract']
- for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
- except StopIteration:
- pass
- else:
- raise ExtractorError(error_msg, expected=True)
+ formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
+
+ ret = self.get_metadata(path, video_id)
+ combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
+ ret.update({
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': combined_subtitles,
+ })
+
+ return ret
+
+
+class ThePlatformFeedIE(ThePlatformBaseIE):
+ _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
+ _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
+ _TEST = {
+ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
+ 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
+ 'md5': '22d2b84f058d3586efcd99e57d59d314',
+ 'info_dict': {
+ 'id': 'n_hardball_5biden_140207',
+ 'ext': 'mp4',
+ 'title': 'The Biden factor: will Joe run in 2016?',
+ 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140208',
+ 'timestamp': 1391824260,
+ 'duration': 467.0,
+ 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ provider_id = mobj.group('provider_id')
+ feed_id = mobj.group('feed_id')
- info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id)
- info_json = self._download_webpage(info_url, video_id)
- info = json.loads(info_json)
+ real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
+ feed = self._download_json(real_url, video_id)
+ entry = feed['entries'][0]
+ formats = []
subtitles = {}
- captions = info.get('captions')
- if isinstance(captions, list):
- for caption in captions:
- lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
- subtitles[lang] = [{
- 'ext': 'srt' if mime == 'text/srt' else 'ttml',
- 'url': src,
- }]
+ first_video_id = None
+ duration = None
+ for item in entry['media$content']:
+ smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M'
+ cur_video_id = url_basename(smil_url)
+ if first_video_id is None:
+ first_video_id = cur_video_id
+ duration = float_or_none(item.get('plfile$duration'))
+ cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
+ formats.extend(cur_formats)
+ subtitles = self._merge_subtitles(subtitles, cur_subtitles)
- head = meta.find(_x('smil:head'))
- body = meta.find(_x('smil:body'))
+ self._sort_formats(formats)
- f4m_node = body.find(_x('smil:seq//smil:video'))
- if f4m_node is None:
- f4m_node = body.find(_x('smil:seq/smil:video'))
- if f4m_node is not None and '.f4m' in f4m_node.attrib['src']:
- f4m_url = f4m_node.attrib['src']
- if 'manifest.f4m?' not in f4m_url:
- f4m_url += '?'
- # the parameters are from syfy.com, other sites may use others,
- # they also work for nbc.com
- f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
- formats = self._extract_f4m_formats(f4m_url, video_id)
- else:
- formats = []
- switch = body.find(_x('smil:switch'))
- if switch is None:
- switch = body.find(_x('smil:par//smil:switch'))
- if switch is None:
- switch = body.find(_x('smil:par/smil:switch'))
- if switch is None:
- switch = body.find(_x('smil:par'))
- if switch is not None:
- base_url = head.find(_x('smil:meta')).attrib['base']
- for f in switch.findall(_x('smil:video')):
- attr = f.attrib
- width = int_or_none(attr.get('width'))
- height = int_or_none(attr.get('height'))
- vbr = int_or_none(attr.get('system-bitrate'), 1000)
- format_id = '%dx%d_%dk' % (width, height, vbr)
- formats.append({
- 'format_id': format_id,
- 'url': base_url,
- 'play_path': 'mp4:' + attr['src'],
- 'ext': 'flv',
- 'width': width,
- 'height': height,
- 'vbr': vbr,
- })
- else:
- switch = body.find(_x('smil:seq//smil:switch'))
- if switch is None:
- switch = body.find(_x('smil:seq/smil:switch'))
- for f in switch.findall(_x('smil:video')):
- attr = f.attrib
- vbr = int_or_none(attr.get('system-bitrate'), 1000)
- ext = determine_ext(attr['src'])
- if ext == 'once':
- ext = 'mp4'
- formats.append({
- 'format_id': compat_str(vbr),
- 'url': attr['src'],
- 'vbr': vbr,
- 'ext': ext,
- })
- self._sort_formats(formats)
+ thumbnails = [{
+ 'url': thumbnail['plfile$url'],
+ 'width': int_or_none(thumbnail.get('plfile$width')),
+ 'height': int_or_none(thumbnail.get('plfile$height')),
+ } for thumbnail in entry.get('media$thumbnails', [])]
- return {
+ timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
+ categories = [item['media$name'] for item in entry.get('media$categories', [])]
+
+ ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+ subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
+ ret.update({
'id': video_id,
- 'title': info['title'],
- 'subtitles': subtitles,
'formats': formats,
- 'description': info['description'],
- 'thumbnail': info['defaultThumbnailUrl'],
- 'duration': int_or_none(info.get('duration'), 1000),
- }
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'categories': categories,
+ })
+
+ return ret
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
index a77c6a2fc..5d09eb9a8 100644
--- a/youtube_dl/extractor/thesixtyone.py
+++ b/youtube_dl/extractor/thesixtyone.py
@@ -1,9 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
from ..utils import unified_strdate
@@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor):
song
)/(?P<id>[A-Za-z0-9]+)/?$'''
_SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
- _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
+ _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
_THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
_TESTS = [
{
@@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- song_id = mobj.group('id')
+ song_id = self._match_id(url)
webpage = self._download_webpage(
self._SONG_URL_TEMPLATE.format(song_id), song_id)
- song_data = json.loads(self._search_regex(
- r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
+ song_data = self._parse_json(self._search_regex(
+ r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id)
+
+ if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None):
+ song_data['audio_server'] = 's3.amazonaws.com'
+ else:
+ song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com'
+
keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
url = self._SONG_FILE_URL_TEMPLATE.format(
"".join(reversed(keys)), **song_data)
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644
index 000000000..36493a5de
--- /dev/null
+++ b/youtube_dl/extractor/thisamericanlife.py
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+ 'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+ 'info_dict': {
+ 'id': '487',
+ 'ext': 'm4a',
+ 'title': '487: Harper High School, Part One',
+ 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+ 'protocol': 'm3u8_native',
+ 'ext': 'm4a',
+ 'acodec': 'aac',
+ 'vcodec': 'none',
+ 'abr': 64,
+ 'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+ 'description': self._html_search_meta(r'description', webpage, 'description'),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index 9f9e388c5..13263614c 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
IE_NAME = 'tlc.com'
_VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
- _TEST = {
+ # DiscoveryIE has _TESTS
+ _TESTS = [{
'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
- 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
'info_dict': {
- 'id': '853232',
+ 'id': '104493',
'ext': 'mp4',
- 'title': 'Cake Boss: Too Big to Fly',
+ 'title': 'Too Big to Fly',
'description': 'Buddy has taken on a high flying task.',
'duration': 119,
+ 'timestamp': 1393365060,
+ 'upload_date': '20140225',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ }]
class TlcDeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
index c5c6fdc51..7dbe68b5c 100644
--- a/youtube_dl/extractor/tmz.py
+++ b/youtube_dl/extractor/tmz.py
@@ -30,3 +30,31 @@ class TMZIE(InfoExtractor):
'description': self._og_search_description(webpage),
'thumbnail': self._html_search_meta('ThumbURL', webpage),
}
+
+
+class TMZArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+ _TEST = {
+ 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
+ 'md5': 'e482a414a38db73087450e3a6ce69d00',
+ 'info_dict': {
+ 'id': '0_6snoelag',
+ 'ext': 'mp4',
+ 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
+ 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ embedded_video_info_str = self._html_search_regex(
+ r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info')
+
+ embedded_video_info = self._parse_json(
+ embedded_video_info_str, video_id,
+ transform_source=lambda s: s.replace('\\', ''))
+
+ return self.url_result(
+ 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index d48cbbf14..49516abca 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -3,33 +3,70 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- parse_duration,
fix_xml_ampersands,
+ float_or_none,
+ int_or_none,
+ parse_duration,
+ str_to_int,
+ xpath_text,
)
-class TNAFlixIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+class TNAFlixNetworkBaseIE(InfoExtractor):
+ # May be overridden in descendants if necessary
+ _CONFIG_REGEX = [
+ r'flashvars\.config\s*=\s*escape\("([^"]+)"',
+ r'<input[^>]+name="config\d?" value="([^"]+)"',
+ ]
+ _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
+ _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
+ _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
+ _VIEW_COUNT_REGEX = None
+ _COMMENT_COUNT_REGEX = None
+ _AVERAGE_RATING_REGEX = None
+ _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
- _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
- _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
- _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
+ def _extract_thumbnails(self, flix_xml):
- _TEST = {
- 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
- 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
- 'info_dict': {
- 'id': '553878',
- 'display_id': 'Carmella-Decesare-striptease',
- 'ext': 'mp4',
- 'title': 'Carmella Decesare - striptease',
- 'description': '',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'duration': 91,
- 'age_limit': 18,
- }
- }
+ def get_child(elem, names):
+ for name in names:
+ child = elem.find(name)
+ if child is not None:
+ return child
+
+ timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
+ if timeline is None:
+ return
+
+ pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
+ if pattern_el is None or not pattern_el.text:
+ return
+
+ first_el = get_child(timeline, ['imageFirst', 'first'])
+ last_el = get_child(timeline, ['imageLast', 'last'])
+ if first_el is None or last_el is None:
+ return
+
+ first_text = first_el.text
+ last_text = last_el.text
+ if not first_text.isdigit() or not last_text.isdigit():
+ return
+
+ first = int(first_text)
+ last = int(last_text)
+ if first > last:
+ return
+
+ width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
+ height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
+
+ return [{
+ 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
+ 'width': width,
+ 'height': height,
+ } for i in range(first, last + 1)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -38,47 +75,195 @@ class TNAFlixIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- title = self._html_search_regex(
- self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
- description = self._html_search_regex(
- self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
-
- age_limit = self._rta_search(webpage)
-
- duration = self._html_search_meta('duration', webpage, 'duration', default=None)
- if duration:
- duration = parse_duration(duration[1:])
-
cfg_url = self._proto_relative_url(self._html_search_regex(
self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
cfg_xml = self._download_xml(
- cfg_url, display_id, note='Downloading metadata',
+ cfg_url, display_id, 'Downloading metadata',
transform_source=fix_xml_ampersands)
- thumbnail = cfg_xml.find('./startThumb').text
-
formats = []
+
+ def extract_video_url(vl):
+ return re.sub('speed=\d+', 'speed=', vl.text)
+
+ video_link = cfg_xml.find('./videoLink')
+ if video_link is not None:
+ formats.append({
+ 'url': extract_video_url(video_link),
+ 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
+ })
+
for item in cfg_xml.findall('./quality/item'):
- video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
- format_id = item.find('res').text
- fmt = {
- 'url': video_url,
+ video_link = item.find('./videoLink')
+ if video_link is None:
+ continue
+ res = item.find('res')
+ format_id = None if res is None else res.text
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
+ formats.append({
+ 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
'format_id': format_id,
- }
- m = re.search(r'^(\d+)', format_id)
- if m:
- fmt['height'] = int(m.group(1))
- formats.append(fmt)
+ 'height': height,
+ })
+
self._sort_formats(formats)
+ thumbnail = self._proto_relative_url(
+ xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
+ thumbnails = self._extract_thumbnails(cfg_xml)
+
+ title = self._html_search_regex(
+ self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+
+ age_limit = self._rta_search(webpage)
+
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
+
+ def extract_field(pattern, name):
+ return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
+
+ description = extract_field(self._DESCRIPTION_REGEX, 'description')
+ uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
+ view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
+ comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
+ average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
+
+ categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
+ categories = categories_str.split(', ') if categories_str is not None else []
+
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'duration': duration,
'age_limit': age_limit,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'average_rating': average_rating,
+ 'categories': categories,
'formats': formats,
}
+
+
+class TNAFlixIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+
+ _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
+ _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
+ _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'
+
+ _TESTS = [{
+ # anonymous uploader, no categories
+ 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+ 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+ 'info_dict': {
+ 'id': '553878',
+ 'display_id': 'Carmella-Decesare-striptease',
+ 'ext': 'mp4',
+ 'title': 'Carmella Decesare - striptease',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 91,
+ 'age_limit': 18,
+ 'uploader': 'Anonymous',
+ 'categories': [],
+ }
+ }, {
+ # non-anonymous uploader, categories
+ 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
+ 'md5': '0f5d4d490dbfd117b8607054248a07c0',
+ 'info_dict': {
+ 'id': '6538',
+ 'display_id': 'Educational-xxx-video',
+ 'ext': 'mp4',
+ 'title': 'Educational xxx video',
+ 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 164,
+ 'age_limit': 18,
+ 'uploader': 'bobwhite39',
+ 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'],
+ }
+ }, {
+ 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+ 'only_matching': True,
+ }]
+
+
+class EMPFlixIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
+
+ _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>'
+
+ _TESTS = [{
+ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+ 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+ 'info_dict': {
+ 'id': '33051',
+ 'display_id': 'Amateur-Finger-Fuck',
+ 'ext': 'mp4',
+ 'title': 'Amateur Finger Fuck',
+ 'description': 'Amateur solo finger fucking.',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 83,
+ 'age_limit': 18,
+ 'uploader': 'cwbike',
+ 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+ }
+ }, {
+ 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+ 'only_matching': True,
+ }]
+
+
+class MovieFapIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+
+ _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
+ _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
+ _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
+ _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'
+
+ _TESTS = [{
+ # normal, multi-format video
+ 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
+ 'md5': '26624b4e2523051b550067d547615906',
+ 'info_dict': {
+ 'id': 'be9867c9416c19f54a4a',
+ 'display_id': 'experienced-milf-amazing-handjob',
+ 'ext': 'mp4',
+ 'title': 'Experienced MILF Amazing Handjob',
+ 'description': 'Experienced MILF giving an Amazing Handjob',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'uploader': 'darvinfred06',
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
+ }
+ }, {
+ # quirky single-format case where the extension is given as fid, but the video is really an flv
+ 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
+ 'md5': 'fa56683e291fc80635907168a743c9ad',
+ 'info_dict': {
+ 'id': 'e5da0d3edce5404418f5',
+ 'display_id': 'jeune-couple-russe',
+ 'ext': 'flv',
+ 'title': 'Jeune Couple Russe',
+ 'description': 'Amateur',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'uploader': 'whiskeyjar',
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['Amateur', 'Teen'],
+ }
+ }]
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762..c9cb69333 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
webpage = self._download_webpage(req, display_id)
flashvars = json.loads(self._html_search_regex(
- r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+ r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
video_url = flashvars['video_url']
if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
thumbnail = flashvars.get('image_url')
title = self._html_search_regex(
- r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+ r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
description = self._html_search_regex(
- r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+ r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
uploader = self._html_search_regex(
- r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+ r'<span class="username">\s*(.+?)\s*<',
webpage, 'uploader', fatal=False)
like_count = int_or_none(self._html_search_regex(
- r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+ r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
dislike_count = int_or_none(self._html_search_regex(
- r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+ r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
view_count = self._html_search_regex(
- r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+ r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 000000000..4f86b3ee9
--- /dev/null
+++ b/youtube_dl/extractor/tubitv.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+ _LOGIN_URL = 'http://tubitv.com/login'
+ _NETRC_MACHINE = 'tubitv'
+ _TEST = {
+ 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+ 'info_dict': {
+ 'id': '54411',
+ 'ext': 'mp4',
+ 'title': 'The Kitchen Musical - EP01',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'description': 'md5:37532716166069b353e8866e71fefae7',
+ 'duration': 2407,
+ },
+ 'params': {
+ 'skip_download': 'HLS download',
+ },
+ }
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ form_data = {
+ 'username': username,
+ 'password': password,
+ }
+ payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+ request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ login_page = self._download_webpage(
+ request, None, False, 'Wrong login info')
+ if not re.search(r'id="tubi-logout"', login_page):
+ raise ExtractorError(
+ 'Login failed (invalid username/password)', expected=True)
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+ self.raise_login_required('This video requires login')
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+ m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index c89de5ba4..84fe71aef 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -29,6 +29,8 @@ class TudouIE(InfoExtractor):
}
}]
+ _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
+
def _url_for_id(self, id, quality=None):
info_url = "http://v2.tudou.com/f?id=" + str(id)
if quality:
@@ -54,6 +56,10 @@ class TudouIE(InfoExtractor):
thumbnail_url = self._search_regex(
r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)
+ player_url = self._search_regex(
+ r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']",
+ webpage, 'player URL', default=self._PLAYER_URL)
+
segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
segments = json.loads(segs_json)
# It looks like the keys are the arguments that have to be passed as
@@ -76,6 +82,9 @@ class TudouIE(InfoExtractor):
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
+ 'http_headers': {
+ 'Referer': player_url,
+ },
}
result.append(part_info)
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 828c808a6..3d3b635e4 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -28,6 +28,28 @@ class TumblrIE(InfoExtractor):
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
+ }, {
+ 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+ 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+ 'info_dict': {
+ 'id': 'Wmur',
+ 'ext': 'mp4',
+ 'title': 'naked smoking & stretching',
+ 'upload_date': '20150506',
+ 'timestamp': 1430931613,
+ },
+ 'add_ie': ['Vidme'],
+ }, {
+ 'url': 'http://camdamage.tumblr.com/post/98846056295/',
+ 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+ 'info_dict': {
+ 'id': '105463834',
+ 'ext': 'mp4',
+ 'title': 'Cam Damage-HD 720p',
+ 'uploader': 'John Moyer',
+ 'uploader_id': 'user32021558',
+ },
+ 'add_ie': ['Vimeo'],
}]
def _real_extract(self, url):
@@ -36,12 +58,16 @@ class TumblrIE(InfoExtractor):
blog = m_url.group('blog_name')
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
- webpage = self._download_webpage(url, video_id)
+ webpage, urlh = self._download_webpage_handle(url, video_id)
iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
- webpage, 'iframe url')
- iframe = self._download_webpage(iframe_url, video_id)
+ webpage, 'iframe url', default=None)
+ if iframe_url is None:
+ return self.url_result(urlh.geturl(), 'Generic')
+
+ iframe = self._download_webpage(iframe_url, video_id,
+ 'Downloading iframe page')
video_url = self._search_regex(r'<source src="([^"]+)"',
iframe, 'video url')
diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py
index 29703a8a9..7ae63a499 100644
--- a/youtube_dl/extractor/turbo.py
+++ b/youtube_dl/extractor/turbo.py
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
'ext': 'mp4',
'duration': 3715,
'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
- 'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+ 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
title = xpath_text(item, './title', 'title')
duration = int_or_none(xpath_text(item, './durate', 'duration'))
thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
- description = self._og_search_description(webpage)
+ description = self._html_search_meta('description', webpage)
formats = []
get_quality = qualities(['3g', 'sd', 'hq'])
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index 4de0aac52..fad720b68 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
data_content = self._download_webpage(
'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
- video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+ video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
return {
'id': internal_id,
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644
index 000000000..fa338b936
--- /dev/null
+++ b/youtube_dl/extractor/tv2.py
@@ -0,0 +1,126 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
+ remove_end,
+)
+
+
+class TV2IE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tv2.no/v/916509/',
+ 'md5': '9cb9e3410b18b515d71892f27856e9b1',
+ 'info_dict': {
+ 'id': '916509',
+ 'ext': 'flv',
+ 'title': 'Se Gryttens hyllest av Steven Gerrard',
+ 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+ 'timestamp': 1431715610,
+ 'upload_date': '20150515',
+ 'duration': 156.967,
+ 'view_count': int,
+ 'categories': list,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats = []
+ format_urls = []
+ for protocol in ('HDS', 'HLS'):
+ data = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
+ video_id, 'Downloading play JSON')['playback']
+ for item in data['items']['item']:
+ video_url = item.get('url')
+ if not video_url or video_url in format_urls:
+ continue
+ format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+ if not self._is_valid_url(video_url, video_id, format_id):
+ continue
+ format_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=format_id))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=format_id))
+ elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+ pass
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'tbr': int_or_none(item.get('bitrate')),
+ 'filesize': int_or_none(item.get('fileSize')),
+ })
+ self._sort_formats(formats)
+
+ asset = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
+ video_id, 'Downloading metadata JSON')['asset']
+
+ title = asset['title']
+ description = asset.get('description')
+ timestamp = parse_iso8601(asset.get('createTime'))
+ duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
+ view_count = int_or_none(asset.get('views'))
+ categories = asset.get('keywords', '').split(',')
+
+ thumbnails = [{
+ 'id': thumbnail.get('@type'),
+ 'url': thumbnail.get('url'),
+ } for _, thumbnail in asset.get('imageVersions', {}).items()]
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'categories': categories,
+ 'formats': formats,
+ }
+
+
+class TV2ArticleIE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+ 'info_dict': {
+ 'id': '6930542',
+ 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
+ 'description': 'md5:339573779d3eea3542ffe12006190954',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'http://www.tv2.no/a/6930542',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
+ for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+
+ title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+ description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644
index 000000000..3a4f393fc
--- /dev/null
+++ b/youtube_dl/extractor/tvc.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+ 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+ formats = []
+ for info in video.get('path', {}).get('quality', []):
+ video_url = info.get('url')
+ if not video_url:
+ continue
+ format_id = self._search_regex(
+ r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+ 'format id', default=None)
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'width': int_or_none(info.get('width')),
+ 'height': int_or_none(info.get('height')),
+ 'tbr': int_or_none(info.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video['title'],
+ 'thumbnail': video.get('picture'),
+ 'duration': int_or_none(video.get('duration')),
+ 'formats': formats,
+ }
+
+
+class TVCArticleIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/news/show/id/69944',
+ 'info_dict': {
+ 'id': '75399',
+ 'ext': 'mp4',
+ 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+ 'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 278,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+ 'info_dict': {
+ 'id': '2185',
+ 'ext': 'mp4',
+ 'title': 'Ещё не поздно. Эфир от 03.08.2013',
+ 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 3316,
+ },
+ }]
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, self._match_id(url))
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'TVC',
+ 'url': self._og_search_video_url(webpage),
+ 'title': clean_html(self._og_search_title(webpage)),
+ 'description': clean_html(self._og_search_description(webpage)),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 102362b29..dc3a8334a 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -5,7 +5,9 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
float_or_none,
+ int_or_none,
parse_age_limit,
)
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
'display_id': 'sokrat',
'ext': 'flv',
'title': 'Сократ',
- 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
'duration': 6586,
- 'age_limit': 0,
+ 'age_limit': 12,
},
+ 'skip': 'georestricted',
},
{
'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
- 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+ 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '5142516',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'duration': 186.080,
'age_limit': 0,
},
+ 'skip': 'georestricted',
}, {
'url': 'https://cloud.tvigle.ru/video/5267604/',
'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
- r'<li class="video-preview current_playing" id="(\d+)">',
+ r'class="video-preview current_playing" id="(\d+)">',
webpage, 'video id')
video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
item = video_data['playlist']['items'][0]
+ videos = item.get('videos')
+
+ error_message = item.get('errorMessage')
+ if not videos and error_message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
title = item['title']
- description = item['description']
- thumbnail = item['thumbnail']
+ description = item.get('description')
+ thumbnail = item.get('thumbnail')
duration = float_or_none(item.get('durationMilliseconds'), 1000)
age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
- for quality, video_url in fmts.items():
+ for format_id, video_url in fmts.items():
+ if format_id == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=vcodec))
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
formats.append({
'url': video_url,
- 'format_id': '%s-%s' % (vcodec, quality),
+ 'format_id': '%s-%s' % (vcodec, format_id),
'vcodec': vcodec,
- 'height': int(quality[:-1]),
- 'filesize': item['video_files_size'][vcodec][quality],
+ 'height': int_or_none(height),
+ 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index e83e31a31..b4683de54 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
viasat4play\.no/programmer|
tv6play\.no/programmer|
tv3play\.dk/programmer|
+ play\.novatv\.bg/programi
)/[^/]+/(?P<id>\d+)
'''
_TESTS = [
@@ -103,6 +104,7 @@ class TVPlayIE(InfoExtractor):
'duration': 1492,
'timestamp': 1330522854,
'upload_date': '20120229',
+ 'age_limit': 18,
},
'params': {
# rtmp download
@@ -173,6 +175,22 @@ class TVPlayIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+ 'info_dict': {
+ 'id': '624952',
+ 'ext': 'flv',
+ 'title': 'Здравей, България (12.06.2015 г.) ',
+ 'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+ 'duration': 8838,
+ 'timestamp': 1434100372,
+ 'upload_date': '20150612',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py
index c80ec15cf..f3198fb85 100644
--- a/youtube_dl/extractor/tweakers.py
+++ b/youtube_dl/extractor/tweakers.py
@@ -1,19 +1,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- xpath_text,
- xpath_with_ns,
- int_or_none,
- float_or_none,
-)
class TweakersIE(InfoExtractor):
_VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
_TEST = {
'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
- 'md5': '1b5afa817403bb5baa08359dca31e6df',
+ 'md5': '3147e4ddad366f97476a93863e4557c8',
'info_dict': {
'id': '9926',
'ext': 'mp4',
@@ -25,41 +19,7 @@ class TweakersIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- playlist = self._download_xml(
- 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % video_id,
- video_id)
-
- NS_MAP = {
- 'xspf': 'http://xspf.org/ns/0/',
- 's1': 'http://static.streamone.nl/player/ns/0',
- }
-
- track = playlist.find(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP))
-
- title = xpath_text(
- track, xpath_with_ns('./xspf:title', NS_MAP), 'title')
- description = xpath_text(
- track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
- thumbnail = xpath_text(
- track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
- duration = float_or_none(
- xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'),
- 1000)
-
- formats = [{
- 'url': location.text,
- 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
- 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
- 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
- } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
+ playlist_id = self._match_id(url)
+ entries = self._extract_xspf_playlist(
+ 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index 67e8bfea0..c1ee1decc 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.24video.net/video/view/1044982',
- 'md5': '48dd7646775690a80447a8dca6a2df76',
+ 'md5': 'd041af8b5b4246ea466226a0d6693345',
'info_dict': {
'id': '1044982',
'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
webpage, 'upload date'))
uploader = self._html_search_regex(
- r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+ r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
webpage, 'uploader', fatal=False)
view_count = int_or_none(self._html_search_regex(
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 94bd6345d..023911c41 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -7,12 +7,17 @@ import random
from .common import InfoExtractor
from ..compat import (
+ compat_parse_qs,
compat_str,
compat_urllib_parse,
+ compat_urllib_parse_urlparse,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
+ int_or_none,
+ parse_duration,
parse_iso8601,
)
@@ -22,8 +27,8 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'http://usher.twitch.tv'
- _LOGIN_URL = 'https://secure.twitch.tv/user/login'
- _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login'
+ _LOGIN_URL = 'https://secure.twitch.tv/login'
+ _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new'
_NETRC_MACHINE = 'twitch'
def _handle_error(self, response):
@@ -59,32 +64,35 @@ class TwitchBaseIE(InfoExtractor):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
- authenticity_token = self._search_regex(
- r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
- login_page, 'authenticity token')
-
- login_form = {
- 'utf8': '✓'.encode('utf-8'),
- 'authenticity_token': authenticity_token,
- 'redirect_on_login': '',
- 'embed_form': 'false',
- 'mp_source_action': 'login-button',
- 'follow': '',
- 'login': username,
- 'password': password,
- }
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'login': username.encode('utf-8'),
+ 'password': password.encode('utf-8'),
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=self._LOGIN_POST_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
request = compat_urllib_request.Request(
- self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
request.add_header('Referer', self._LOGIN_URL)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
- m = re.search(
- r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
- if m:
+ error_message = self._search_regex(
+ r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
+ response, 'error message', default=None)
+ if error_message:
raise ExtractorError(
- 'Unable to login: %s' % m.group('msg').strip(), expected=True)
+ 'Unable to login. Twitch said: %s' % error_message, expected=True)
+
+ if '>Reset your password<' in response:
+ self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
def _prefer_source(self, formats):
try:
@@ -133,14 +141,14 @@ class TwitchItemBaseIE(TwitchBaseIE):
def _extract_info(self, info):
return {
'id': info['_id'],
- 'title': info['title'],
- 'description': info['description'],
- 'duration': info['length'],
- 'thumbnail': info['preview'],
- 'uploader': info['channel']['display_name'],
- 'uploader_id': info['channel']['name'],
- 'timestamp': parse_iso8601(info['recorded_at']),
- 'view_count': info['views'],
+ 'title': info.get('title') or 'Untitled Broadcast',
+ 'description': info.get('description'),
+ 'duration': int_or_none(info.get('length')),
+ 'thumbnail': info.get('preview'),
+ 'uploader': info.get('channel', {}).get('display_name'),
+ 'uploader_id': info.get('channel', {}).get('name'),
+ 'timestamp': parse_iso8601(info.get('recorded_at')),
+ 'view_count': int_or_none(info.get('views')),
}
def _real_extract(self, url):
@@ -188,25 +196,45 @@ class TwitchVodIE(TwitchItemBaseIE):
_ITEM_TYPE = 'vod'
_ITEM_SHORTCUT = 'v'
- _TEST = {
- 'url': 'http://www.twitch.tv/ksptv/v/3622000',
+ _TESTS = [{
+ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
'info_dict': {
- 'id': 'v3622000',
+ 'id': 'v6528877',
'ext': 'mp4',
- 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+ 'title': 'LCK Summer Split - Week 6 Day 1',
'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 6951,
- 'timestamp': 1419028564,
- 'upload_date': '20141219',
- 'uploader': 'KSPTV',
- 'uploader_id': 'ksptv',
+ 'duration': 17208,
+ 'timestamp': 1435131709,
+ 'upload_date': '20150624',
+ 'uploader': 'Riot Games',
+ 'uploader_id': 'riotgames',
'view_count': int,
+ 'start_time': 310,
},
'params': {
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ # Untitled broadcast (title is None)
+ 'url': 'http://www.twitch.tv/belkao_o/v/11230755',
+ 'info_dict': {
+ 'id': 'v11230755',
+ 'ext': 'mp4',
+ 'title': 'Untitled Broadcast',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1638,
+ 'timestamp': 1439746708,
+ 'upload_date': '20150816',
+ 'uploader': 'BelkAO_o',
+ 'uploader_id': 'belkao_o',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
item_id = self._match_id(url)
@@ -215,11 +243,17 @@ class TwitchVodIE(TwitchItemBaseIE):
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % self._ITEM_TYPE)
formats = self._extract_m3u8_formats(
- '%s/vod/%s?nauth=%s&nauthsig=%s'
+ '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
% (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
self._prefer_source(formats)
info['formats'] = formats
+
+ parsed_url = compat_urllib_parse_urlparse(url)
+ query = compat_parse_qs(parsed_url.query)
+ if 't' in query:
+ info['start_time'] = parse_duration(query['t'][0])
+
return info
@@ -314,9 +348,9 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE):
class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream'
- _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+ _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
- _TEST = {
+ _TESTS = [{
'url': 'http://www.twitch.tv/shroomztv',
'info_dict': {
'id': '12772022048',
@@ -335,7 +369,10 @@ class TwitchStreamIE(TwitchBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.twitch.tv/miracle_doto#profile-0',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -350,6 +387,12 @@ class TwitchStreamIE(TwitchBaseIE):
'http://www.twitch.tv/%s/profile' % channel_id,
'TwitchProfile', channel_id)
+ # Channel name may be typed if different case than the original channel name
+ # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing
+ # an invalid m3u8 URL. Working around by use of original channel name from stream
+ # JSON and fallback to lowercase if it's not available.
+ channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
+
access_token = self._download_json(
'%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
'Downloading channel access token')
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
new file mode 100644
index 000000000..1aaa06305
--- /dev/null
+++ b/youtube_dl/extractor/twitter.py
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+ float_or_none,
+ unescapeHTML,
+)
+
+
+class TwitterCardIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+ 'md5': 'a74f50b310c83170319ba16de6955192',
+ 'info_dict': {
+ 'id': '560070183650213889',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 30.033,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ # Different formats served for different User-Agents
+ USER_AGENTS = [
+ 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4
+ 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm
+ ]
+
+ config = None
+ formats = []
+ for user_agent in USER_AGENTS:
+ request = compat_urllib_request.Request(url)
+ request.add_header('User-Agent', user_agent)
+ webpage = self._download_webpage(request, video_id)
+
+ config = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'data player config')),
+ video_id)
+
+ video_url = config['playlist'][0]['source']
+
+ f = {
+ 'url': video_url,
+ }
+
+ m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ thumbnail = config.get('posterImageUrl')
+ duration = float_or_none(config.get('duration'))
+
+ return {
+ 'id': video_id,
+ 'title': 'TwitterCard',
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 4667ed83b..365d8b4bf 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -15,7 +15,8 @@ from ..utils import (
class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
_VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
- _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+ _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
+ _ORIGIN_URL = 'https://www.udemy.com'
_NETRC_MACHINE = 'udemy'
_TESTS = [{
@@ -69,34 +70,39 @@ class UdemyIE(InfoExtractor):
def _login(self):
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError(
- 'Udemy account is required, use --username and --password options to provide account credentials.',
- expected=True)
+ self.raise_login_required('Udemy account is required')
login_popup = self._download_webpage(
- 'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
- 'Downloading login popup')
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(webpage):
+ return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
- if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
+ # already logged in
+ if is_logged(login_popup):
return
- csrf = self._html_search_regex(
- r'<input type="hidden" name="csrf" value="(.+?)"',
- login_popup, 'csrf token')
+ login_form = self._form_hidden_inputs('login-form', login_popup)
+
+ login_form.update({
+ 'email': username.encode('utf-8'),
+ 'password': password.encode('utf-8'),
+ })
- login_form = {
- 'email': username,
- 'password': password,
- 'csrf': csrf,
- 'displayType': 'json',
- 'isSubmitted': '1',
- }
request = compat_urllib_request.Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- response = self._download_json(
+ request.add_header('Referer', self._ORIGIN_URL)
+ request.add_header('Origin', self._ORIGIN_URL)
+
+ response = self._download_webpage(
request, None, 'Logging in as %s' % username)
- if 'returnUrl' not in response:
+ if not is_logged(response):
+ error = self._html_search_regex(
+ r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py
index c08428acf..2151f8338 100644
--- a/youtube_dl/extractor/udn.py
+++ b/youtube_dl/extractor/udn.py
@@ -11,6 +11,7 @@ from ..compat import compat_urlparse
class UDNEmbedIE(InfoExtractor):
+ IE_DESC = '聯合影音'
_VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
_TESTS = [{
'url': 'http://video.udn.com/embed/news/300040',
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
index 96c809eaf..c4751050e 100644
--- a/youtube_dl/extractor/ultimedia.py
+++ b/youtube_dl/extractor/ultimedia.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
qualities,
@@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- deliver_url = self._search_regex(
- r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
- webpage, 'deliver URL')
+ deliver_url = self._proto_relative_url(self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+ webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
deliver_page = self._download_webpage(
deliver_url, video_id, 'Downloading iframe page')
@@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):
player = self._parse_json(
self._search_regex(
- r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+ r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
+ deliver_page, 'player'),
video_id)
quality = qualities(['flash', 'html5'])
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index dd026748d..722eb5236 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- redirect_page, urlh = self._download_webpage_handle(url, video_id)
- new_location = self._search_regex(r'window\.location = \'(.*)\';',
- redirect_page, 'redirect location')
- redirect_url = urlh.geturl() + new_location
- webpage = self._download_webpage(redirect_url, video_id,
+ # need to get the page 3 times for the correct jsSecretToken cookie
+ # which is necessary for the correct title
+ def get_session_id():
+ redirect_page = self._download_webpage(url, video_id)
+ session_id_url = self._search_regex(
+ r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+ 'session id url')
+ self._download_webpage(
+ compat_urlparse.urljoin(url, session_id_url), video_id,
+ 'Getting session id')
+
+ get_session_id()
+ get_session_id()
+
+ webpage = self._download_webpage(url, video_id,
'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 346edf485..0d8d832cc 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -5,6 +5,7 @@ import json
from .common import InfoExtractor
from ..compat import (
+ compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
@@ -76,7 +77,7 @@ class VeeHDIE(InfoExtractor):
if config_json:
config = json.loads(config_json)
- video_url = compat_urlparse.unquote(config['clip']['url'])
+ video_url = compat_urllib_parse_unquote(config['clip']['url'])
if not video_url:
video_url = self._html_search_regex(
diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py
index 6215f0642..3c8d2a943 100644
--- a/youtube_dl/extractor/vessel.py
+++ b/youtube_dl/extractor/vessel.py
@@ -38,9 +38,13 @@ class VesselIE(InfoExtractor):
return req
@staticmethod
- def find_assets(data, asset_type):
+ def find_assets(data, asset_type, asset_id=None):
for asset in data.get('assets', []):
- if asset.get('type') == asset_type:
+ if not asset.get('type') == asset_type:
+ continue
+ elif asset_id is not None and not asset.get('id') == asset_id:
+ continue
+ else:
yield asset
def _check_access_rights(self, data):
@@ -82,11 +86,13 @@ class VesselIE(InfoExtractor):
req = VesselIE.make_json_request(
self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
data = self._download_json(req, video_id)
+ video_asset_id = data.get('main_video_asset')
self._check_access_rights(data)
try:
- video_asset = next(VesselIE.find_assets(data, 'video'))
+ video_asset = next(
+ VesselIE.find_assets(data, 'video', asset_id=video_asset_id))
except StopIteration:
raise ExtractorError('No video assets found')
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 69dc9a759..f38a72fde 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -4,11 +4,26 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class VGTVIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/[^/]+/(?P<id>[0-9]+)'
+ IE_DESC = 'VGTV and BTTV'
+ _VALID_URL = r'''(?x)
+ (?:
+ vgtv:|
+ http://(?:www\.)?
+ )
+ (?P<host>vgtv|bt)
+ (?:
+ :|
+ \.no/(?:tv/)?\#!/(?:video|live)/
+ )
+ (?P<id>[0-9]+)
+ '''
_TESTS = [
{
# streamType: vod
@@ -47,16 +62,16 @@ class VGTVIE(InfoExtractor):
},
{
# streamType: live
- 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+ 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
'info_dict': {
- 'id': '100015',
+ 'id': '113063',
'ext': 'flv',
- 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
- 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+ 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:b3743425765355855f88e096acc93231',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 0,
- 'timestamp': 1407423348,
- 'upload_date': '20140807',
+ 'timestamp': 1432975582,
+ 'upload_date': '20150530',
'view_count': int,
},
'params': {
@@ -64,25 +79,47 @@ class VGTVIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
+
+ HOST_WEBSITES = {
+ 'vgtv': 'vgtv',
+ 'bt': 'bttv',
+ }
+
data = self._download_json(
- 'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
+ 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website'
+ % (host, video_id, HOST_WEBSITES[host]),
video_id, 'Downloading media JSON')
+ if data.get('status') == 'inactive':
+ raise ExtractorError(
+ 'Video %s is no longer available' % video_id, expected=True)
+
streams = data['streamUrls']
+ stream_type = data.get('streamType')
formats = []
hls_url = streams.get('hls')
if hls_url:
- formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, 'mp4', m3u8_id='hls'))
hds_url = streams.get('hds')
- if hds_url:
- formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id))
+ # wasLive hds are always 404
+ if hds_url and stream_type != 'wasLive':
+ formats.extend(self._extract_f4m_formats(
+ hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ video_id, f4m_id='hds'))
mp4_url = streams.get('mp4')
if mp4_url:
@@ -107,11 +144,60 @@ class VGTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': data['title'],
+ 'title': self._live_title(data['title']),
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
'formats': formats,
+ 'is_live': True if stream_type == 'live' else False,
}
+
+
+class BTArticleIE(InfoExtractor):
+ IE_NAME = 'bt:article'
+ IE_DESC = 'Bergens Tidende Articles'
+ _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+ _TEST = {
+ 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
+ 'md5': 'd055e8ee918ef2844745fcfd1a4175fb',
+ 'info_dict': {
+ 'id': '23199',
+ 'ext': 'mp4',
+ 'title': 'Alrekstad internat',
+ 'description': 'md5:dc81a9056c874fedb62fc48a300dac58',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 191,
+ 'timestamp': 1289991323,
+ 'upload_date': '20101117',
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, self._match_id(url))
+ video_id = self._search_regex(
+ r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id')
+ return self.url_result('vgtv:bt:%s' % video_id, 'VGTV')
+
+
+class BTVestlendingenIE(InfoExtractor):
+ IE_NAME = 'bt:vestlendingen'
+ IE_DESC = 'Bergens Tidende - Vestlendingen'
+ _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
+ 'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+ 'info_dict': {
+ 'id': '86588',
+ 'ext': 'mov',
+ 'title': 'Otto Wollertsen',
+ 'description': 'Vestlendingen Otto Fredrik Wollertsen',
+ 'timestamp': 1430473209,
+ 'upload_date': '20150501',
+ },
+ }
+
+ def _real_extract(self, url):
+ return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream')
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 71f520fb5..01af7a995 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,5 +1,4 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
@@ -7,31 +6,34 @@ from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
- _TEST = {
- 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- 'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
- 'ext': 'mp4',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- },
- 'params': {
- # Requires ffmpeg (m3u8 manifest)
- 'skip_download': True,
- },
- }
+ _TESTS = [
+ {
+ 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'only_matching': True,
+ }
+ ]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- webpage = self._download_webpage(url, name)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
try:
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
'ooyala embed code')
ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
- print(ooyala_url)
except ExtractorError:
raise ExtractorError('The page doesn\'t contain a video', expected=True)
return self.url_result(ooyala_url, ie='Ooyala')
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
deleted file mode 100644
index 0eb3d9414..000000000
--- a/youtube_dl/extractor/videobam.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import json
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class VideoBamIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
-
- _TESTS = [
- {
- 'url': 'http://videobam.com/OiJQM',
- 'md5': 'db471f27763a531f10416a0c58b5a1e0',
- 'info_dict': {
- 'id': 'OiJQM',
- 'ext': 'mp4',
- 'title': 'Is Alcohol Worse Than Ecstasy?',
- 'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
- 'uploader': 'frihetsvinge',
- },
- },
- {
- 'url': 'http://videobam.com/pqLvq',
- 'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
- 'note': 'HD video',
- 'info_dict': {
- 'id': 'pqLvq',
- 'ext': 'mp4',
- 'title': '_',
- }
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
-
- formats = []
-
- for preference, format_id in enumerate(['low', 'high']):
- mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
- if not mobj:
- continue
- formats.append({
- 'url': mobj.group('url'),
- 'ext': 'mp4',
- 'format_id': format_id,
- 'preference': preference,
- })
-
- if not formats:
- player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
- formats = [{
- 'url': item['url'],
- 'ext': 'mp4',
- } for item in player_config['playlist'] if 'autoPlay' in item]
-
- self._sort_formats(formats)
-
- title = self._og_search_title(page, default='_', fatal=False)
- description = self._og_search_description(page, default=None)
- thumbnail = self._og_search_thumbnail(page)
- uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
- view_count = int_or_none(
- self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'view_count': view_count,
- 'formats': formats,
- 'age_limit': 18,
- }
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
index d6a7eb203..ef2da5632 100644
--- a/youtube_dl/extractor/videolecturesnet.py
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -12,7 +12,7 @@ from ..utils import (
class VideoLecturesNetIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+ _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$'
IE_NAME = 'videolectures.net'
_TEST = {
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index eb309a7cd..78ff6310a 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -8,20 +8,23 @@ from ..compat import compat_urllib_request
class VideoMegaIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:www\.)?videomega\.tv/
- (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
- '''
- _TEST = {
- 'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
- 'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+ _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
+ 'md5': 'cc1920a58add3f05c6a93285b84fb3aa',
'info_dict': {
- 'id': '4GNA688SU99US886ANG4',
+ 'id': 'AOSQBJYKIDDIKYJBQSOA',
'ext': 'mp4',
- 'title': 'BigBuckBunny_320x180',
+ 'title': '1254207',
'thumbnail': 're:^https?://.*\.jpg$',
}
- }
+ }, {
+ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -29,12 +32,13 @@ class VideoMegaIE(InfoExtractor):
iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
req = compat_urllib_request.Request(iframe_url)
req.add_header('Referer', url)
+ req.add_header('Cookie', 'noadvtday=0')
webpage = self._download_webpage(req, video_id)
title = self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'title')
+ r'<title>(.+?)</title>', webpage, 'title')
title = re.sub(
- r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
+ r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
video_url = self._search_regex(
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index ececc7ee0..591024ead 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
formats = [
{
- 'url': base64.b64decode(res['u']).decode('utf-8'),
+ 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
'ext': 'flv',
'format_id': res['l'],
} for res in settings['res'] if res['u']
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index bd953fb4c..157bb74fe 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -10,7 +10,7 @@ from ..utils import (
class VidmeIE(InfoExtractor):
_VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vid.me/QNB',
'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
'info_dict': {
@@ -22,10 +22,36 @@ class VidmeIE(InfoExtractor):
'timestamp': 1406313244,
'upload_date': '20140725',
'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
},
- }
+ }, {
+ # tests uploader field
+ 'url': 'https://vid.me/4Iib',
+ 'info_dict': {
+ 'id': '4Iib',
+ 'ext': 'mp4',
+ 'title': 'The Carver',
+ 'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+ 'duration': 97.859999999999999,
+ 'timestamp': 1433203629,
+ 'upload_date': '20150602',
+ 'uploader': 'Thomas',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+ 'url': 'https://vid.me/e/Wmur',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
+ url = url.replace('vid.me/e/', 'vid.me/')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -35,16 +61,23 @@ class VidmeIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage, default='')
thumbnail = self._og_search_thumbnail(webpage)
- timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False))
- width = int_or_none(self._og_search_property('video:width', webpage, fatal=False))
- height = int_or_none(self._og_search_property('video:height', webpage, fatal=False))
+ timestamp = int_or_none(self._og_search_property(
+ 'updated_time', webpage, fatal=False))
+ width = int_or_none(self._og_search_property(
+ 'video:width', webpage, fatal=False))
+ height = int_or_none(self._og_search_property(
+ 'video:height', webpage, fatal=False))
duration = float_or_none(self._html_search_regex(
r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
view_count = str_to_int(self._html_search_regex(
- r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+ r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?',
+ webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex(
r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
webpage, 'like count', fatal=False))
+ uploader = self._html_search_regex(
+ 'class="video_author_username"[^>]*>([^<]+)',
+ webpage, 'uploader', default=None)
return {
'id': video_id,
@@ -58,4 +91,5 @@ class VidmeIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'like_count': like_count,
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 619039e51..15377097e 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -38,11 +38,14 @@ class VierIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
- r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+ [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+ webpage, 'video id')
application = self._search_regex(
- r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+ [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+ webpage, 'application', default='vier_vod')
filename = self._search_regex(
- r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+ [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+ webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index 1742e66f4..cda02ba24 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -1,129 +1,142 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_request
+from ..compat import (
+ compat_urllib_request,
+ compat_urllib_parse,
+ compat_urllib_parse_unquote,
+)
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ HEADRequest,
+)
class ViewsterIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P<id>\d+-\d+-\d+)'
+ _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
_TESTS = [{
- # movielink, paymethod=fre
- 'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/',
- 'playlist': [{
- 'md5': '8f9d94b282d80c42b378dffdbb11caf3',
- 'info_dict': {
- 'id': '1293-19341-000-movie',
- 'ext': 'flv',
- 'title': "'Hout' (Wood) - Movie",
- },
- }],
- 'info_dict': {
- 'id': '1293-19341-000',
- 'title': "'Hout' (Wood)",
- 'description': 'md5:925733185a9242ef96f436937683f33b',
- }
- }, {
- # movielink, paymethod=adv
+ # movie, Type=Movie
'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/',
- 'playlist': [{
- 'md5': '77a005453ca7396cbe3d35c9bea30aef',
- 'info_dict': {
- 'id': '1140-11855-000-movie',
- 'ext': 'flv',
- 'title': "THE LISTENING PROJECT - Movie",
- },
- }],
+ 'md5': '14d3cfffe66d57b41ae2d9c873416f01',
'info_dict': {
'id': '1140-11855-000',
- 'title': "THE LISTENING PROJECT",
- 'description': 'md5:714421ae9957e112e672551094bf3b08',
- }
+ 'ext': 'flv',
+ 'title': 'The listening Project',
+ 'description': 'md5:bac720244afd1a8ea279864e67baa071',
+ 'timestamp': 1214870400,
+ 'upload_date': '20080701',
+ 'duration': 4680,
+ },
}, {
- # direct links, no movielink
- 'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/',
- 'playlist': [{
- 'md5': '0307b7eac6bfb21ab0577a71f6eebd8f',
- 'info_dict': {
- 'id': '1198-56411-000-trailer',
- 'ext': 'mp4',
- 'title': "Sinister - Trailer",
- },
- }, {
- 'md5': '80b9ee3ad69fb368f104cb5d9732ae95',
- 'info_dict': {
- 'id': '1198-56411-000-behind-scenes',
- 'ext': 'mp4',
- 'title': "Sinister - Behind Scenes",
- },
- }, {
- 'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5',
- 'info_dict': {
- 'id': '1198-56411-000-scene-from-movie',
- 'ext': 'mp4',
- 'title': "Sinister - Scene from movie",
- },
- }],
+ # series episode, Type=Episode
+ 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/',
+ 'md5': 'd5434c80fcfdb61651cc2199a88d6ba3',
'info_dict': {
- 'id': '1198-56411-000',
- 'title': "Sinister",
- 'description': 'md5:014c40b0488848de9683566a42e33372',
- }
+ 'id': '1284-19427-001',
+ 'ext': 'flv',
+ 'title': 'The World and a Wall',
+ 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',
+ 'timestamp': 1428192000,
+ 'upload_date': '20150405',
+ 'duration': 1500,
+ },
+ }, {
+ # serie, Type=Serie
+ 'url': 'http://www.viewster.com/serie/1303-19426-000/',
+ 'info_dict': {
+ 'id': '1303-19426-000',
+ 'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?',
+ 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11',
+ },
+ 'playlist_count': 13,
+ }, {
+ # unfinished serie, no Type
+ 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/',
+ 'info_dict': {
+ 'id': '1284-19427-000',
+ 'title': 'Baby Steps—Season 2',
+ 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',
+ },
+ 'playlist_mincount': 16,
}]
_ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- request = compat_urllib_request.Request(
- 'http://api.live.viewster.com/api/v1/movie/%s' % video_id)
+ def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+ request = compat_urllib_request.Request(url)
request.add_header('Accept', self._ACCEPT_HEADER)
+ request.add_header('Auth-token', self._AUTH_TOKEN)
+ return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
- movie = self._download_json(
- request, video_id, 'Downloading movie metadata JSON')
-
- title = movie.get('title') or movie['original_title']
- description = movie.get('synopsis')
- thumbnail = movie.get('large_artwork') or movie.get('artwork')
-
- entries = []
- for clip in movie['play_list']:
- entry = None
-
- # movielink api
- link_request = clip.get('link_request')
- if link_request:
- request = compat_urllib_request.Request(
- 'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s&currency=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s'
- % link_request)
- request.add_header('Accept', self._ACCEPT_HEADER)
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ # Get 'api_token' cookie
+ self._request_webpage(HEADRequest(url), video_id)
+ cookies = self._get_cookies(url)
+ self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
- movie_link = self._download_json(
- request, video_id, 'Downloading movie link JSON', fatal=False)
+ info = self._download_json(
+ 'https://public-api.viewster.com/search/%s' % video_id,
+ video_id, 'Downloading entry JSON')
- if movie_link:
- formats = self._extract_f4m_formats(
- movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id)
- self._sort_formats(formats)
- entry = {
- 'formats': formats,
- }
+ entry_id = info.get('Id') or info['id']
- # direct link
- clip_url = clip.get('clip_data', {}).get('url')
- if clip_url:
- entry = {
- 'url': clip_url,
- 'ext': 'mp4',
- }
+ # unfinished serie has no Type
+ if info.get('Type') in ['Serie', None]:
+ episodes = self._download_json(
+ 'https://public-api.viewster.com/series/%s/episodes' % entry_id,
+ video_id, 'Downloading series JSON')
+ entries = [
+ self.url_result(
+ 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster')
+ for episode in episodes]
+ title = (info.get('Title') or info['Synopsis']['Title']).strip()
+ description = info.get('Synopsis', {}).get('Detailed')
+ return self.playlist_result(entries, video_id, title, description)
- if entry:
- entry.update({
- 'id': '%s-%s' % (video_id, clip['canonical_title']),
- 'title': '%s - %s' % (title, clip['title']),
+ formats = []
+ for media_type in ('application/f4m+xml', 'application/x-mpegURL'):
+ media = self._download_json(
+ 'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
+ % (entry_id, compat_urllib_parse.quote(media_type)),
+ video_id, 'Downloading %s JSON' % media_type, fatal=False)
+ if not media:
+ continue
+ video_url = media.get('Uri')
+ if not video_url:
+ continue
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ video_url += '&' if '?' in video_url else '?'
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls',
+ fatal=False # m3u8 sometimes fail
+ ))
+ else:
+ formats.append({
+ 'url': video_url,
})
- entries.append(entry)
+ self._sort_formats(formats)
- playlist = self.playlist_result(entries, video_id, title, description)
- playlist['thumbnail'] = thumbnail
- return playlist
+ synopsis = info.get('Synopsis', {})
+ # Prefer title outside synopsis since it's less messy
+ title = (info.get('Title') or synopsis['Title']).strip()
+ description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short')
+ duration = int_or_none(info.get('Duration'))
+ timestamp = parse_iso8601(info.get('ReleaseDate'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index cf6af1e5c..ddbd395c8 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,29 +1,105 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
+import json
+import time
+import hmac
+import hashlib
+import itertools
-from ..compat import (
- compat_urlparse,
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
- unescapeHTML,
- unified_strdate,
- US_RATINGS,
- determine_ext,
- mimetype2ext,
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
)
+from ..compat import compat_urllib_request
from .common import InfoExtractor
-class VikiIE(InfoExtractor):
- IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+ _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+ _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+ _APP = '65535a'
+ _APP_VERSION = '2.2.5.1428709186'
+ _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+ _NETRC_MACHINE = 'viki'
+
+ _token = None
+
+ def _prepare_call(self, path, timestamp=None, post_data=None):
+ path += '?' if '?' not in path else '&'
+ if not timestamp:
+ timestamp = int(time.time())
+ query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+ if self._token:
+ query += '&token=%s' % self._token
+ sig = hmac.new(
+ self._APP_SECRET.encode('ascii'),
+ query.encode('ascii'),
+ hashlib.sha1
+ ).hexdigest()
+ url = self._API_URL_TEMPLATE % (query, sig)
+ return compat_urllib_request.Request(
+ url, json.dumps(post_data).encode('utf-8')) if post_data else url
+
+ def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+ resp = self._download_json(
+ self._prepare_call(path, timestamp, post_data), video_id, note)
+
+ error = resp.get('error')
+ if error:
+ if error == 'invalid timestamp':
+ resp = self._download_json(
+ self._prepare_call(path, int(resp['current_timestamp']), post_data),
+ video_id, '%s (retry)' % note)
+ error = resp.get('error')
+ if error:
+ self._raise_error(resp['error'])
+
+ return resp
+
+ def _raise_error(self, error):
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error),
+ expected=True)
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'login_id': username,
+ 'password': password,
+ }
+
+ login = self._call_api(
+ 'sessions.json', None,
+ 'Logging in as %s' % username, post_data=login_form)
+
+ self._token = login.get('token')
+ if not self._token:
+ self.report_warning('Unable to get session token, login has probably failed')
+
+ @staticmethod
+ def dict_selection(dict_obj, preferred_key):
+ if preferred_key in dict_obj:
+ return dict_obj.get(preferred_key)
- # iPad2
- _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
+ filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
+ return filtered_dict[0] if filtered_dict else None
- _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+
+class VikiIE(VikiBaseIE):
+ IE_NAME = 'viki'
+ _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
@@ -37,111 +113,224 @@ class VikiIE(InfoExtractor):
},
'skip': 'Blocked in the US',
}, {
+ # clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
- 'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
+ 'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
'info_dict': {
'id': '1067139v',
'ext': 'mp4',
+ 'title': "'The Avengers: Age of Ultron' Press Conference",
'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+ 'duration': 352,
+ 'timestamp': 1430380829,
'upload_date': '20150430',
- 'title': '\'The Avengers: Age of Ultron\' Press Conference',
+ 'uploader': 'Arirang TV',
+ 'like_count': int,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': {
'id': '1048879v',
'ext': 'mp4',
- 'upload_date': '20140820',
- 'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
'title': 'Ankhon Dekhi',
+ 'duration': 6512,
+ 'timestamp': 1408532356,
+ 'upload_date': '20140820',
+ 'uploader': 'Spuul',
+ 'like_count': int,
+ 'age_limit': 13,
},
'params': {
- # requires ffmpeg
+ # m3u8 download
'skip_download': True,
}
+ }, {
+ # episode
+ 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+ 'md5': '190f3ef426005ba3a080a63325955bc3',
+ 'info_dict': {
+ 'id': '44699v',
+ 'ext': 'mp4',
+ 'title': 'Boys Over Flowers - Episode 1',
+ 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+ 'duration': 4155,
+ 'timestamp': 1270496524,
+ 'upload_date': '20100405',
+ 'uploader': 'group8',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ # youtube external
+ 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+ 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'info_dict': {
+ 'id': '50562v',
+ 'ext': 'mp4',
+ 'title': 'Poor Nastya [COMPLETE] - Episode 1',
+ 'description': '',
+ 'duration': 607,
+ 'timestamp': 1274949505,
+ 'upload_date': '20101213',
+ 'uploader': 'ad14065n',
+ 'uploader_id': 'ad14065n',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ 'url': 'http://www.viki.com/player/44699v',
+ 'only_matching': True,
+ }, {
+ # non-English description
+ 'url': 'http://www.viki.com/videos/158036v-love-in-magic',
+ 'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+ 'info_dict': {
+ 'id': '158036v',
+ 'ext': 'mp4',
+ 'uploader': 'I Planet Entertainment',
+ 'upload_date': '20111122',
+ 'timestamp': 1321985454,
+ 'description': 'md5:44b1e46619df3a072294645c770cef36',
+ 'title': 'Love In Magic',
+ },
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- uploader_m = re.search(
- r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
- if uploader_m is None:
- uploader = None
- else:
- uploader = uploader_m.group(1).strip()
-
- rating_str = self._html_search_regex(
- r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
- 'rating information', default='').strip()
- age_limit = US_RATINGS.get(rating_str)
-
- req = compat_urllib_request.Request(
- 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
- req.add_header('User-Agent', self._USER_AGENT)
- info_webpage = self._download_webpage(
- req, video_id, note='Downloading info page')
- err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
- if err_msg:
- if 'not available in your region' in err_msg:
- raise ExtractorError(
- 'Video %s is blocked from your location.' % video_id,
- expected=True)
- else:
- raise ExtractorError('Viki said: ' + err_msg)
- mobj = re.search(
- r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
- if not mobj:
- raise ExtractorError('Unable to find video URL')
- video_url = unescapeHTML(mobj.group('url'))
- video_ext = mimetype2ext(mobj.group('mime_type'))
-
- if determine_ext(video_url) == 'm3u8':
- formats = self._extract_m3u8_formats(
- video_url, video_id, ext=video_ext)
- else:
- formats = [{
- 'url': video_url,
- 'ext': video_ext,
- }]
-
- upload_date_str = self._html_search_regex(
- r'"created_at":"([^"]+)"', info_webpage, 'upload date')
- upload_date = (
- unified_strdate(upload_date_str)
- if upload_date_str is not None
- else None
- )
-
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
- return {
+ video = self._call_api(
+ 'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+ title = self.dict_selection(video.get('titles', {}), 'en')
+ if not title:
+ title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+ container_titles = video.get('container', {}).get('titles', {})
+ container_title = self.dict_selection(container_titles, 'en')
+ title = '%s - %s' % (container_title, title)
+
+ description = self.dict_selection(video.get('descriptions', {}), 'en')
+
+ duration = int_or_none(video.get('duration'))
+ timestamp = parse_iso8601(video.get('created_at'))
+ uploader = video.get('author')
+ like_count = int_or_none(video.get('likes', {}).get('count'))
+ age_limit = parse_age_limit(video.get('rating'))
+
+ thumbnails = []
+ for thumbnail_id, thumbnail in video.get('images', {}).items():
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail.get('url'),
+ })
+
+ subtitles = {}
+ for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+ subtitles[subtitle_lang] = [{
+ 'ext': subtitles_format,
+ 'url': self._prepare_call(
+ 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+ } for subtitles_format in ('srt', 'vtt')]
+
+ result = {
'id': video_id,
'title': title,
- 'formats': formats,
'description': description,
- 'thumbnail': thumbnail,
- 'age_limit': age_limit,
+ 'duration': duration,
+ 'timestamp': timestamp,
'uploader': uploader,
- 'subtitles': video_subtitles,
- 'upload_date': upload_date,
+ 'like_count': like_count,
+ 'age_limit': age_limit,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
}
- def _get_subtitles(self, video_id, info_webpage):
- res = {}
- for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
- sturl = unescapeHTML(sturl_html)
- m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
- if not m:
- continue
- res[m.group('lang')] = [{
- 'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
- 'ext': 'vtt',
- }]
- return res
+ streams = self._call_api(
+ 'videos/%s/streams.json' % video_id, video_id,
+ 'Downloading video streams JSON')
+
+ if 'external' in streams:
+ result.update({
+ '_type': 'url_transparent',
+ 'url': streams['external']['url'],
+ })
+ return result
+
+ formats = []
+ for format_id, stream_dict in streams.items():
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ for protocol, format_dict in stream_dict.items():
+ if format_id == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol)
+ else:
+ formats.append({
+ 'url': format_dict['url'],
+ 'format_id': '%s-%s' % (format_id, protocol),
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ result['formats'] = formats
+ return result
+
+
+class VikiChannelIE(VikiBaseIE):
+ IE_NAME = 'viki:channel'
+ _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+ _TESTS = [{
+ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+ 'info_dict': {
+ 'id': '50c',
+ 'title': 'Boys Over Flowers',
+ 'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+ },
+ 'playlist_count': 70,
+ }, {
+ 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+ 'info_dict': {
+ 'id': '1354c',
+ 'title': 'Poor Nastya [COMPLETE]',
+ 'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+ },
+ 'playlist_count': 127,
+ }, {
+ 'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.viki.com/artists/2141c-shinee',
+ 'only_matching': True,
+ }]
+
+ _PER_PAGE = 25
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ channel = self._call_api(
+ 'containers/%s.json' % channel_id, channel_id,
+ 'Downloading channel JSON')
+
+ title = self.dict_selection(channel['titles'], 'en')
+
+ description = self.dict_selection(channel['descriptions'], 'en')
+
+ entries = []
+ for video_type in ('episodes', 'clips', 'movies'):
+ for page_num in itertools.count(1):
+ page = self._call_api(
+ 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+ % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+ 'Downloading %s JSON page #%d' % (video_type, page_num))
+ for video in page['response']:
+ video_id = video['id']
+ entries.append(self.url_result(
+ 'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+ if not page['pagination']['next']:
+ break
+
+ return self.playlist_result(entries, channel_id, title, description)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index f300c7ca4..50df79ca1 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -22,12 +22,14 @@ from ..utils import (
unified_strdate,
unsmuggle_url,
urlencode_postdata,
+ unescapeHTML,
)
class VimeoBaseInfoExtractor(InfoExtractor):
_NETRC_MACHINE = 'vimeo'
_LOGIN_REQUIRED = False
+ _LOGIN_URL = 'https://vimeo.com/log_in'
def _login(self):
(username, password) = self._get_login_info()
@@ -36,21 +38,25 @@ class VimeoBaseInfoExtractor(InfoExtractor):
raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return
self.report_login()
- login_url = 'https://vimeo.com/log_in'
- webpage = self._download_webpage(login_url, None, False)
- token = self._search_regex(r'xsrft":"(.*?)"', webpage, 'login token')
+ webpage = self._download_webpage(self._LOGIN_URL, None, False)
+ token = self._extract_xsrft(webpage)
data = urlencode_postdata({
+ 'action': 'login',
'email': username,
'password': password,
- 'action': 'login',
'service': 'vimeo',
'token': token,
})
- login_request = compat_urllib_request.Request(login_url, data)
+ login_request = compat_urllib_request.Request(self._LOGIN_URL, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- login_request.add_header('Cookie', 'xsrft=%s' % token)
+ login_request.add_header('Referer', self._LOGIN_URL)
self._download_webpage(login_request, None, False, 'Wrong login info')
+ def _extract_xsrft(self, webpage):
+ return self._search_regex(
+ r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
+ webpage, 'login token', group='xsrft')
+
class VimeoIE(VimeoBaseInfoExtractor):
"""Information extractor for vimeo.com."""
@@ -173,11 +179,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
]
+ @staticmethod
+ def _extract_vimeo_url(url, webpage):
+ # Look for embedded (iframe) Vimeo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+ if mobj:
+ player_url = unescapeHTML(mobj.group('url'))
+ surl = smuggle_url(player_url, {'Referer': url})
+ return surl
+ # Look for embedded (swf embed) Vimeo player
+ mobj = re.search(
+ r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+ if mobj:
+ return mobj.group(1)
+
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
+ token = self._extract_xsrft(webpage)
data = urlencode_postdata({
'password': password,
'token': token,
@@ -187,7 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
url = url.replace('http://', 'https://')
password_request = compat_urllib_request.Request(url + '/password', data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- password_request.add_header('Cookie', 'xsrft=%s' % token)
+ password_request.add_header('Referer', url)
return self._download_webpage(
password_request, video_id,
'Verifying the password', 'Wrong password')
@@ -406,10 +427,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
}
-class VimeoChannelIE(InfoExtractor):
+class VimeoChannelIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:channel'
_VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+ _TITLE = None
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
_TESTS = [{
'url': 'https://vimeo.com/channels/tributes',
@@ -424,7 +446,7 @@ class VimeoChannelIE(InfoExtractor):
return '%s/videos/page:%d/' % (base_url, pagenum)
def _extract_list_title(self, webpage):
- return self._html_search_regex(self._TITLE_RE, webpage, 'list title')
+ return self._TITLE or self._html_search_regex(self._TITLE_RE, webpage, 'list title')
def _login_list_password(self, page_url, list_id, webpage):
login_form = self._search_regex(
@@ -436,12 +458,8 @@ class VimeoChannelIE(InfoExtractor):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', login_form))
- token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
+ fields = self._hidden_inputs(login_form)
+ token = self._extract_xsrft(webpage)
fields['token'] = token
fields['password'] = password
post = urlencode_postdata(fields)
@@ -487,7 +505,7 @@ class VimeoChannelIE(InfoExtractor):
class VimeoUserIE(VimeoChannelIE):
IE_NAME = 'vimeo:user'
- _VALID_URL = r'https://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
+ _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
_TESTS = [{
'url': 'https://vimeo.com/nkistudio/videos',
@@ -591,14 +609,14 @@ class VimeoReviewIE(InfoExtractor):
return self.url_result(player_url, 'Vimeo', video_id)
-class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
+class VimeoWatchLaterIE(VimeoChannelIE):
IE_NAME = 'vimeo:watchlater'
IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
- _VALID_URL = r'https://vimeo\.com/home/watchlater|:vimeowatchlater'
+ _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater'
+ _TITLE = 'Watch Later'
_LOGIN_REQUIRED = True
- _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
_TESTS = [{
- 'url': 'https://vimeo.com/home/watchlater',
+ 'url': 'https://vimeo.com/watchlater',
'only_matching': True,
}]
@@ -614,7 +632,7 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
return request
def _real_extract(self, url):
- return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
+ return self._extract_videos('watchlater', 'https://vimeo.com/watchlater')
class VimeoLikesIE(InfoExtractor):
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
index aa3d6ddfd..92321d66e 100644
--- a/youtube_dl/extractor/vimple.py
+++ b/youtube_dl/extractor/vimple.py
@@ -4,7 +4,29 @@ from .common import InfoExtractor
from ..utils import int_or_none
-class VimpleIE(InfoExtractor):
+class SprutoBaseIE(InfoExtractor):
+ def _extract_spruto(self, spruto, video_id):
+ playlist = spruto['playlist'][0]
+ title = playlist['title']
+ video_id = playlist.get('videoId') or video_id
+ thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+ duration = int_or_none(playlist.get('duration'))
+
+ formats = [{
+ 'url': f['url'],
+ } for f in playlist['video']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
+class VimpleIE(SprutoBaseIE):
IE_DESC = 'Vimple - one-click video hosting'
_VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})'
_TESTS = [
@@ -30,25 +52,9 @@ class VimpleIE(InfoExtractor):
webpage = self._download_webpage(
'http://player.vimple.ru/iframe/%s' % video_id, video_id)
- playlist = self._parse_json(
+ spruto = self._parse_json(
self._search_regex(
r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
- video_id)['playlist'][0]
-
- title = playlist['title']
- video_id = playlist.get('videoId') or video_id
- thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
- duration = int_or_none(playlist.get('duration'))
-
- formats = [{
- 'url': f['url'],
- } for f in playlist['video']]
- self._sort_formats(formats)
+ video_id)
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
+ return self._extract_spruto(spruto, video_id)
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 65c459fad..c733a48fa 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -75,7 +75,7 @@ class VineIE(InfoExtractor):
return {
'id': video_id,
'title': self._og_search_title(webpage),
- 'alt_title': self._og_search_description(webpage),
+ 'alt_title': self._og_search_description(webpage, default=None),
'description': data['description'],
'thumbnail': data['thumbnailUrl'],
'upload_date': unified_strdate(data['created']),
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index cc384adbf..c30c5a8e5 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -13,14 +13,26 @@ from ..compat import (
from ..utils import (
ExtractorError,
orderedSet,
+ str_to_int,
unescapeHTML,
unified_strdate,
)
class VKIE(InfoExtractor):
- IE_NAME = 'vk.com'
- _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
+ IE_NAME = 'vk'
+ IE_DESC = 'VK'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+ (?:
+ (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+ (?:www\.)?biqle\.ru/watch/
+ )
+ (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+ )
+ '''
_NETRC_MACHINE = 'vk'
_TESTS = [
@@ -34,6 +46,7 @@ class VKIE(InfoExtractor):
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'duration': 195,
'upload_date': '20120212',
+ 'view_count': int,
},
},
{
@@ -45,7 +58,8 @@ class VKIE(InfoExtractor):
'uploader': 'Tom Cruise',
'title': 'No name',
'duration': 9,
- 'upload_date': '20130721'
+ 'upload_date': '20130721',
+ 'view_count': int,
}
},
{
@@ -59,6 +73,7 @@ class VKIE(InfoExtractor):
'title': 'Lin Dan',
'duration': 101,
'upload_date': '20120730',
+ 'view_count': int,
}
},
{
@@ -73,7 +88,8 @@ class VKIE(InfoExtractor):
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352,
- 'upload_date': '20121218'
+ 'upload_date': '20121218',
+ 'view_count': int,
},
'skip': 'Requires vk account credentials',
},
@@ -100,14 +116,54 @@ class VKIE(InfoExtractor):
'title': 'Книга Илая',
'duration': 6771,
'upload_date': '20140626',
+ 'view_count': int,
},
'skip': 'Only works from Russia',
},
{
+ # video (removed?) only available with list id
+ 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
+ 'md5': '091287af5402239a1051c37ec7b92913',
+ 'info_dict': {
+ 'id': '171201961',
+ 'ext': 'mp4',
+ 'title': 'ТюменцевВВ_09.07.2015',
+ 'uploader': 'Anton Ivanov',
+ 'duration': 109,
+ 'upload_date': '20150709',
+ 'view_count': int,
+ },
+ },
+ {
+ # youtube embed
+ 'url': 'https://vk.com/video276849682_170681728',
+ 'info_dict': {
+ 'id': 'V3K4mi0SYkc',
+ 'ext': 'mp4',
+ 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
+ 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+ 'duration': 179,
+ 'upload_date': '20130116',
+ 'uploader': "Children's Joy Foundation",
+ 'uploader_id': 'thecjf',
+ 'view_count': int,
+ },
+ },
+ {
# removed video, just testing that we match the pattern
'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
'only_matching': True,
},
+ {
+ # age restricted video, requires vk account credentials
+ 'url': 'https://vk.com/video205387401_164765225',
+ 'only_matching': True,
+ },
+ {
+ # vk wrapper
+ 'url': 'http://www.biqle.ru/watch/847655_160197695',
+ 'only_matching': True,
+ }
]
def _login(self):
@@ -115,20 +171,25 @@ class VKIE(InfoExtractor):
if username is None:
return
- login_form = {
- 'act': 'login',
- 'role': 'al_frame',
- 'expire': '1',
- 'email': username,
- 'pass': password,
- }
+ login_page = self._download_webpage(
+ 'https://vk.com', None, 'Downloading login page')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'email': username.encode('cp1251'),
+ 'pass': password.encode('cp1251'),
+ })
- request = compat_urllib_request.Request('https://login.vk.com/?act=login',
- compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+ request = compat_urllib_request.Request(
+ 'https://login.vk.com/?act=login',
+ compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ login_page = self._download_webpage(
+ request, None, note='Logging in as %s' % username)
if re.search(r'onLoginFailed', login_page):
- raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+ raise ExtractorError(
+ 'Unable to login, incorrect username and/or password', expected=True)
def _real_initialize(self):
self._login()
@@ -140,9 +201,26 @@ class VKIE(InfoExtractor):
if not video_id:
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+ info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+
+ # Some videos (removed?) can only be downloaded with list id specified
+ list_id = mobj.group('list_id')
+ if list_id:
+ info_url += '&list=%s' % list_id
+
info_page = self._download_webpage(info_url, video_id)
+ error_message = self._html_search_regex(
+ r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ info_page, 'error message', default=None)
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
+
+ if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+ raise ExtractorError(
+ 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+ expected=True)
+
ERRORS = {
r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
'Video %s has been removed from public access due to rightholder complaint.',
@@ -156,16 +234,20 @@ class VKIE(InfoExtractor):
r'<!>Видео временно недоступно':
'Video %s is temporarily unavailable.',
+
+ r'<!>Access denied':
+ 'Access denied to video %s.',
}
for error_re, error_msg in ERRORS.items():
if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True)
- m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
- if m_yt is not None:
- self.to_screen('Youtube video detected')
- return self.url_result(m_yt.group(1), 'Youtube')
+ youtube_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+ info_page, 'youtube iframe', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
@@ -175,25 +257,29 @@ class VKIE(InfoExtractor):
m_rutube.group(1).replace('\\', ''))
return self.url_result(rutube_url)
- m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+ m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts:
- m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+ m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
if m_opts_url:
opts_url = m_opts_url.group(1)
if opts_url.startswith('//'):
opts_url = 'http:' + opts_url
return self.url_result(opts_url)
- data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+ data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
data = json.loads(data_json)
# Extract upload date
upload_date = None
- mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+ mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
if mobj is not None:
mobj.group(1) + ' ' + mobj.group(2)
upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
+ view_count = str_to_int(self._search_regex(
+ r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+ info_page, 'view count', fatal=False))
+
formats = [{
'format_id': k,
'url': v,
@@ -210,29 +296,39 @@ class VKIE(InfoExtractor):
'uploader': data.get('md_author'),
'duration': data.get('duration'),
'upload_date': upload_date,
+ 'view_count': view_count,
}
class VKUserVideosIE(InfoExtractor):
- IE_NAME = 'vk.com:user-videos'
- IE_DESC = 'vk.com:All of a user\'s videos'
- _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?'
+ IE_NAME = 'vk:uservideos'
+ IE_DESC = "VK - User's Videos"
+ _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$'
_TEMPLATE_URL = 'https://vk.com/videos'
- _TEST = {
+ _TESTS = [{
'url': 'http://vk.com/videos205387401',
'info_dict': {
'id': '205387401',
+ 'title': "Tom Cruise's Videos",
},
'playlist_mincount': 4,
- }
+ }, {
+ 'url': 'http://vk.com/videos-77521',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
- page = self._download_webpage(url, page_id)
- video_ids = orderedSet(
- m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page))
- url_entries = [
+
+ webpage = self._download_webpage(url, page_id)
+
+ entries = [
self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_entries, page_id)
+ for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
+
+ title = unescapeHTML(self._search_regex(
+ r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
+ webpage, 'title', default=page_id))
+
+ return self.playlist_result(entries, page_id, title)
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index 1c0966a79..ccf1928b5 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
@@ -28,12 +26,7 @@ class VodlockerIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- (?:id="[^"]+"\s+)?
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
if fields['op'] == 'download1':
self._sleep(3, video_id) # they do detect when requests happen too fast!
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
new file mode 100644
index 000000000..254383d6c
--- /dev/null
+++ b/youtube_dl/extractor/voicerepublic.py
@@ -0,0 +1,99 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ int_or_none,
+)
+
+
+class VoiceRepublicIE(InfoExtractor):
+ _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+ _TESTS = [{
+ 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+ 'md5': '0554a24d1657915aa8e8f84e15dc9353',
+ 'info_dict': {
+ 'id': '2296',
+ 'display_id': 'watching-the-watchers-building-a-sousveillance-state',
+ 'ext': 'm4a',
+ 'title': 'Watching the Watchers: Building a Sousveillance State',
+ 'description': 'md5:715ba964958afa2398df615809cfecb1',
+ 'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+ 'duration': 1800,
+ 'view_count': int,
+ }
+ }, {
+ 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ req = compat_urllib_request.Request(
+ compat_urlparse.urljoin(url, '/talks/%s' % display_id))
+ # Older versions of Firefox get redirected to an "upgrade browser" page
+ req.add_header('User-Agent', 'youtube-dl')
+ webpage = self._download_webpage(req, display_id)
+
+ if '>Queued for processing, please stand by...<' in webpage:
+ raise ExtractorError(
+ 'Audio is still queued for processing', expected=True)
+
+ config = self._search_regex(
+ r'(?s)return ({.+?});\s*\n', webpage,
+ 'data', default=None)
+ data = self._parse_json(config, display_id, fatal=False) if config else None
+ if data:
+ title = data['title']
+ description = data.get('teaser')
+ talk_id = data.get('talk_id') or display_id
+ talk = data['talk']
+ duration = int_or_none(talk.get('duration'))
+ formats = [{
+ 'url': compat_urlparse.urljoin(url, talk_url),
+ 'format_id': format_id,
+ 'ext': determine_ext(talk_url) or format_id,
+ 'vcodec': 'none',
+ } for format_id, talk_url in talk['links'].items()]
+ else:
+ title = self._og_search_title(webpage)
+ description = self._html_search_regex(
+ r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
+ webpage, 'description', fatal=False)
+ talk_id = self._search_regex(
+ [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
+ webpage, 'talk id', default=None) or display_id
+ duration = None
+ player = self._search_regex(
+ r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player')
+ formats = [{
+ 'url': compat_urlparse.urljoin(url, talk_url),
+ 'format_id': format_id,
+ 'ext': determine_ext(talk_url) or format_id,
+ 'vcodec': 'none',
+ } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)]
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ view_count = int_or_none(self._search_regex(
+ r"class='play-count[^']*'>\s*(\d+) plays",
+ webpage, 'play count', fatal=False))
+
+ return {
+ 'id': talk_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 405cb9db4..149e36467 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -36,6 +36,7 @@ class VubeIE(InfoExtractor):
'comment_count': int,
'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
},
+ 'skip': 'Not accessible from Travis CI server',
}, {
'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
'md5': 'db7aba89d4603dadd627e9d1973946fe',
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index c3fde53f5..a6d9b5fee 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):
links_code = self._search_regex(
r'''(?xs)
(?:
- <img\s+src="/im/play.gif".*?>|
+ <img\s+src="[^"]*/play.gif".*?>|
<!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
)
(.*?)
diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py
index 1eb24a3d6..faa167e65 100644
--- a/youtube_dl/extractor/vulture.py
+++ b/youtube_dl/extractor/vulture.py
@@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):
query_webpage = self._download_webpage(
query_url, display_id, note='Downloading query page')
params_json = self._search_regex(
- r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
+ r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
query_webpage,
'player params')
params = json.loads(params_json)
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 73077a312..2037d9b3d 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import int_or_none
@@ -98,3 +100,42 @@ class WebOfStoriesIE(InfoExtractor):
'description': description,
'duration': duration,
}
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.webofstories.com/playAll/donald.knuth',
+ 'info_dict': {
+ 'id': 'donald.knuth',
+ 'title': 'Donald Knuth (Scientist)',
+ },
+ 'playlist_mincount': 97,
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories')
+ for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage))
+ ]
+
+ title = self._search_regex(
+ r'<div id="speakerName">\s*<span>([^<]+)</span>',
+ webpage, 'speaker', default=None)
+ if title:
+ field = self._search_regex(
+ r'<span id="primaryField">([^<]+)</span>',
+ webpage, 'field', default=None)
+ if field:
+ title += ' (%s)' % field
+
+ if not title:
+ title = self._search_regex(
+ r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
+ webpage, 'title')
+
+ return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index d6dec25ca..f69d46a28 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
+ [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"],
+ webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index d5c26a032..a3ea26feb 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+ _TESTS = [{
"url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
@@ -15,7 +15,15 @@ class WorldStarHipHopIE(InfoExtractor):
"ext": "mp4",
"title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
- }
+ }, {
+ 'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
+ 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
+ 'info_dict': {
+ 'id': 'wshh6a7q1ny0G34ZwuIO',
+ 'ext': 'mp4',
+ "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -26,19 +34,22 @@ class WorldStarHipHopIE(InfoExtractor):
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
video_url = self._search_regex(
- r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
+ [r'so\.addVariable\("file","(.*?)"\)',
+ r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
+ webpage, 'video URL')
if 'youtube' in video_url:
return self.url_result(video_url, ie='Youtube')
video_title = self._html_search_regex(
- r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+ [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+ r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
webpage, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
thumbnail = self._html_search_regex(
r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
- fatal=False)
+ default=None)
if not thumbnail:
_title = r'candytitles.*>(.*)</span>'
mobj = re.search(_title, webpage)
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
index 80c48c37d..4ff99e5ca 100644
--- a/youtube_dl/extractor/xbef.py
+++ b/youtube_dl/extractor/xbef.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class XBefIE(InfoExtractor):
@@ -30,7 +28,7 @@ class XBefIE(InfoExtractor):
config_url_enc = self._download_webpage(
'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
note='Retrieving config URL')
- config_url = compat_urllib_parse.unquote(config_url_enc)
+ config_url = compat_urllib_parse_unquote(config_url_enc)
config = self._download_xml(
config_url, video_id, note='Retrieving config')
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4527567f8..97315750f 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,7 +4,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
unified_strdate,
str_to_int,
int_or_none,
@@ -13,7 +12,6 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- """Information Extractor for xHamster"""
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
@@ -23,7 +21,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
- 'uploader_id': 'Ruseful2011',
+ 'uploader': 'Ruseful2011',
'duration': 893,
'age_limit': 18,
}
@@ -35,7 +33,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
- 'uploader_id': 'jojo747400',
+ 'uploader': 'jojo747400',
'duration': 200,
'age_limit': 18,
}
@@ -47,12 +45,12 @@ class XHamsterIE(InfoExtractor):
]
def _real_extract(self, url):
- def extract_video_url(webpage):
- mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
- if mp4 is None:
- raise ExtractorError('Unable to extract media URL')
- else:
- return mp4.group(1)
+ def extract_video_url(webpage, name):
+ return self._search_regex(
+ [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+ r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+ r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+ webpage, name, group='mp4')
def is_hd(webpage):
return '<div class=\'icon iconHD\'' in webpage
@@ -76,10 +74,14 @@ class XHamsterIE(InfoExtractor):
if upload_date:
upload_date = unified_strdate(upload_date)
- uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
- webpage, 'uploader id', default='anonymous')
+ uploader = self._html_search_regex(
+ r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)",
+ webpage, 'uploader', default='anonymous')
- thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
+ thumbnail = self._search_regex(
+ [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
+ r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
+ webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
webpage, 'duration', fatal=False))
@@ -98,7 +100,9 @@ class XHamsterIE(InfoExtractor):
hd = is_hd(webpage)
- video_url = extract_video_url(webpage)
+ format_id = 'hd' if hd else 'sd'
+
+ video_url = extract_video_url(webpage, format_id)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
@@ -109,7 +113,7 @@ class XHamsterIE(InfoExtractor):
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
- video_url = extract_video_url(webpage)
+ video_url = extract_video_url(webpage, 'hd')
formats.append({
'url': video_url,
'format_id': 'hd',
@@ -123,7 +127,7 @@ class XHamsterIE(InfoExtractor):
'title': title,
'description': description,
'upload_date': upload_date,
- 'uploader_id': uploader_id,
+ 'uploader': uploader,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
@@ -133,3 +137,36 @@ class XHamsterIE(InfoExtractor):
'age_limit': age_limit,
'formats': formats,
}
+
+
+class XHamsterEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://xhamster.com/xembed.php?video=3328539',
+ 'info_dict': {
+ 'id': '3328539',
+ 'ext': 'mp4',
+ 'title': 'Pen Masturbation',
+ 'upload_date': '20140728',
+ 'uploader_id': 'anonymous',
+ 'duration': 5,
+ 'age_limit': 18,
+ }
+ }
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+ webpage, 'xhamster url')
+
+ return self.url_result(video_url, 'XHamster')
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
index 8c6241aed..7c9d8af6f 100644
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):
r'minus_track\.dur_sec=\'([0-9]*?)\'',
webpage, 'duration', fatal=False))
filesize_approx = parse_filesize(self._html_search_regex(
- r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+ r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
webpage, 'approximate filesize', fatal=False))
tbr = int_or_none(self._html_search_regex(
r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
@@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):
description = re.sub(' *\r *', '\n', description)
enc_token = self._html_search_regex(
- r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+ r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
token = ''.join(
c if pos == 3 else compat_chr(compat_ord(c) - 1)
for pos, c in enumerate(reversed(enc_token)))
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 79ed6c744..5a41f8ffa 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -2,9 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class XNXXIE(InfoExtractor):
@@ -26,7 +24,7 @@ class XNXXIE(InfoExtractor):
video_url = self._search_regex(r'flv_url=(.*?)&amp;',
webpage, 'video URL')
- video_url = compat_urllib_parse.unquote(video_url)
+ video_url = compat_urllib_parse_unquote(video_url)
video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
webpage, 'title')
diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py
new file mode 100644
index 000000000..71584c291
--- /dev/null
+++ b/youtube_dl/extractor/xstream.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ xpath_with_ns,
+ xpath_text,
+ find_xpath_attr,
+)
+
+
+class XstreamIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ xstream:|
+ https?://frontend\.xstream\.(?:dk|net)/
+ )
+ (?P<partner_id>[^/]+)
+ (?:
+ :|
+ /feed/video/\?.*?\bid=
+ )
+ (?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588',
+ 'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+ 'info_dict': {
+ 'id': '86588',
+ 'ext': 'mov',
+ 'title': 'Otto Wollertsen',
+ 'description': 'Vestlendingen Otto Fredrik Wollertsen',
+ 'timestamp': 1430473209,
+ 'upload_date': '20150501',
+ },
+ }, {
+ 'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ partner_id = mobj.group('partner_id')
+ video_id = mobj.group('id')
+
+ data = self._download_xml(
+ 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'
+ % (partner_id, video_id),
+ video_id)
+
+ NS_MAP = {
+ 'atom': 'http://www.w3.org/2005/Atom',
+ 'xt': 'http://xstream.dk/',
+ 'media': 'http://search.yahoo.com/mrss/',
+ }
+
+ entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+ title = xpath_text(
+ entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+ description = xpath_text(
+ entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+ timestamp = parse_iso8601(xpath_text(
+ entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+ formats = []
+ media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+ for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+ media_url = media_content.get('url')
+ if not media_url:
+ continue
+ tbr = int_or_none(media_content.get('bitrate'))
+ mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+ if mobj:
+ formats.append({
+ 'url': mobj.group('url'),
+ 'play_path': 'mp4:%s' % mobj.group('playpath'),
+ 'app': mobj.group('app'),
+ 'ext': 'flv',
+ 'tbr': tbr,
+ 'format_id': 'rtmp-%d' % tbr,
+ })
+ else:
+ formats.append({
+ 'url': media_url,
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ link = find_xpath_attr(
+ entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+ if link is not None:
+ formats.append({
+ 'url': link.get('href'),
+ 'format_id': link.get('rel'),
+ })
+
+ thumbnails = [{
+ 'url': splash.get('url'),
+ 'width': int_or_none(splash.get('width')),
+ 'height': int_or_none(splash.get('height')),
+ } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 1644f53c8..779e4f46a 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_request,
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
)
from ..utils import (
parse_duration,
@@ -59,7 +59,7 @@ class XTubeIE(InfoExtractor):
for format_id, video_url in re.findall(
r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
fmt = {
- 'url': compat_urllib_parse.unquote(video_url),
+ 'url': compat_urllib_parse_unquote(video_url),
'format_id': format_id,
}
m = re.search(r'^(?P<height>\d+)[pP]', format_id)
@@ -68,7 +68,7 @@ class XTubeIE(InfoExtractor):
formats.append(fmt)
if not formats:
- video_url = compat_urllib_parse.unquote(self._search_regex(
+ video_url = compat_urllib_parse_unquote(self._search_regex(
r'flashvars\.video_url\s*=\s*"([^"]+)"',
webpage, 'video URL'))
formats.append({'url': video_url})
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 81d885fdc..5aac8adb3 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -13,6 +13,7 @@ from ..utils import (
class XuiteIE(InfoExtractor):
+ IE_DESC = '隨意窩Xuite影音'
_REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
_VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
_TESTS = [{
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 2a45dc574..5dcf2fdd1 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -4,11 +4,13 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_request,
)
from ..utils import (
clean_html,
ExtractorError,
+ determine_ext,
)
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
}
}
+ _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -33,16 +37,37 @@ class XVideosIE(InfoExtractor):
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- video_url = compat_urllib_parse.unquote(
+ video_url = compat_urllib_parse_unquote(
self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ formats = [{
+ 'url': video_url,
+ }]
+
+ android_req = compat_urllib_request.Request(url)
+ android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+ android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+ if android_webpage is not None:
+ player_params_str = self._search_regex(
+ 'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+ android_webpage, 'player parameters', default='')
+ player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+ if player_params:
+ formats.extend([{
+ 'url': param,
+ 'preference': -10,
+ } for param in player_params if determine_ext(param) == 'mp4'])
+
+ self._sort_formats(formats)
+
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index bf4e659ac..f9afbdbab 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,6 +15,7 @@ from ..utils import (
unescapeHTML,
ExtractorError,
int_or_none,
+ mimetype2ext,
)
from .nbc import NBCSportsVPlayerIE
@@ -236,6 +237,22 @@ class YahooIE(InfoExtractor):
self._sort_formats(formats)
+ closed_captions = self._html_search_regex(
+ r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+ default='[]')
+
+ cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+ subtitles = {}
+ if cc_json:
+ for closed_caption in cc_json:
+ lang = closed_caption['lang']
+ if lang not in subtitles:
+ subtitles[lang] = []
+ subtitles[lang].append({
+ 'url': closed_caption['url'],
+ 'ext': mimetype2ext(closed_caption['content_type']),
+ })
+
return {
'id': video_id,
'display_id': display_id,
@@ -244,6 +261,7 @@ class YahooIE(InfoExtractor):
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
'duration': int_or_none(meta.get('duration')),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
index 19f8762ae..001ee17b6 100644
--- a/youtube_dl/extractor/yam.py
+++ b/youtube_dl/extractor/yam.py
@@ -9,10 +9,12 @@ from ..utils import (
float_or_none,
month_by_abbreviation,
ExtractorError,
+ get_element_by_attribute,
)
class YamIE(InfoExtractor):
+ IE_DESC = '蕃薯藤yam天空部落'
_VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
_TESTS = [{
@@ -23,6 +25,7 @@ class YamIE(InfoExtractor):
'id': '2283921',
'ext': 'mp3',
'title': '發現 - 趙薇 京華煙雲主題曲',
+ 'description': '發現 - 趙薇 京華煙雲主題曲',
'uploader_id': 'princekt',
'upload_date': '20080807',
'duration': 313.0,
@@ -55,6 +58,17 @@ class YamIE(InfoExtractor):
'ext': 'mp4',
},
'skip': 'invalid YouTube URL',
+ }, {
+ 'url': 'http://mymedia.yam.com/m/2373534',
+ 'md5': '7ff74b91b7a817269d83796f8c5890b1',
+ 'info_dict': {
+ 'id': '2373534',
+ 'ext': 'mp3',
+ 'title': '林俊傑&蔡卓妍-小酒窩',
+ 'description': 'md5:904003395a0fcce6cfb25028ff468420',
+ 'upload_date': '20080928',
+ 'uploader_id': 'onliner2',
+ }
}]
def _real_extract(self, url):
@@ -75,15 +89,19 @@ class YamIE(InfoExtractor):
if youtube_url:
return self.url_result(youtube_url, 'Youtube')
+ title = self._html_search_regex(
+ r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
+
api_page = self._download_webpage(
'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
note='Downloading API page')
api_result_obj = compat_urlparse.parse_qs(api_page)
+ info_table = get_element_by_attribute('class', 'info', page)
uploader_id = self._html_search_regex(
- r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z]+)"',
- page, 'uploader id', fatal=False)
- mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2}) ' +
+ r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
+ info_table, 'uploader id', fatal=False)
+ mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
if mobj:
upload_date = '%s%02d%02d' % (
@@ -97,7 +115,8 @@ class YamIE(InfoExtractor):
return {
'id': video_id,
'url': api_result_obj['mp3file'][0],
- 'title': self._html_search_meta('description', page),
+ 'title': title,
+ 'description': self._html_search_meta('description', page),
'duration': duration,
'uploader_id': uploader_id,
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index f4c0f5702..4098e4629 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -1,18 +1,38 @@
-# coding=utf-8
+# coding: utf-8
from __future__ import unicode_literals
import re
import hashlib
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
int_or_none,
float_or_none,
)
-class YandexMusicBaseIE(InfoExtractor):
+class YandexMusicTrackIE(InfoExtractor):
+ IE_NAME = 'yandexmusic:track'
+ IE_DESC = 'Яндекс.Музыка - Трек'
+ _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://music.yandex.ru/album/540508/track/4878838',
+ 'md5': 'f496818aa2f60b6c0062980d2e00dc20',
+ 'info_dict': {
+ 'id': '4878838',
+ 'ext': 'mp3',
+ 'title': 'Carlo Ambrosio - Gypsy Eyes 1',
+ 'filesize': 4628061,
+ 'duration': 193.04,
+ }
+ }
+
def _get_track_url(self, storage_dir, track_id):
data = self._download_json(
'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s'
@@ -35,24 +55,6 @@ class YandexMusicBaseIE(InfoExtractor):
'duration': float_or_none(track.get('durationMs'), 1000),
}
-
-class YandexMusicTrackIE(YandexMusicBaseIE):
- IE_NAME = 'yandexmusic:track'
- IE_DESC = 'Яндекс.Музыка - Трек'
- _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://music.yandex.ru/album/540508/track/4878838',
- 'md5': 'f496818aa2f60b6c0062980d2e00dc20',
- 'info_dict': {
- 'id': '4878838',
- 'ext': 'mp3',
- 'title': 'Carlo Ambrosio - Gypsy Eyes 1',
- 'filesize': 4628061,
- 'duration': 193.04,
- }
- }
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
album_id, track_id = mobj.group('album_id'), mobj.group('id')
@@ -64,7 +66,15 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
return self._get_track_info(track)
-class YandexMusicAlbumIE(YandexMusicBaseIE):
+class YandexMusicPlaylistBaseIE(InfoExtractor):
+ def _build_playlist(self, tracks):
+ return [
+ self.url_result(
+ 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id']))
+ for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)]
+
+
+class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
IE_NAME = 'yandexmusic:album'
IE_DESC = 'Яндекс.Музыка - Альбом'
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)'
@@ -85,7 +95,7 @@ class YandexMusicAlbumIE(YandexMusicBaseIE):
'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,
album_id, 'Downloading album JSON')
- entries = [self._get_track_info(track) for track in album['volumes'][0]]
+ entries = self._build_playlist(album['volumes'][0])
title = '%s - %s' % (album['artists'][0]['name'], album['title'])
year = album.get('year')
@@ -95,12 +105,12 @@ class YandexMusicAlbumIE(YandexMusicBaseIE):
return self.playlist_result(entries, compat_str(album['id']), title)
-class YandexMusicPlaylistIE(YandexMusicBaseIE):
+class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
IE_NAME = 'yandexmusic:playlist'
IE_DESC = 'Яндекс.Музыка - Плейлист'
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
'info_dict': {
'id': '1245',
@@ -108,20 +118,54 @@ class YandexMusicPlaylistIE(YandexMusicBaseIE):
'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
},
'playlist_count': 6,
- }
+ }, {
+ # playlist exceeding the limit of 150 tracks shipped with webpage (see
+ # https://github.com/rg3/youtube-dl/issues/6666)
+ 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
+ 'info_dict': {
+ 'id': '1036',
+ 'title': 'Музыка 90-х',
+ },
+ 'playlist_count': 310,
+ }]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- playlist = self._parse_json(
+ mu = self._parse_json(
self._search_regex(
r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
- playlist_id)['pageData']['playlist']
-
- entries = [self._get_track_info(track) for track in playlist['tracks']]
+ playlist_id)
+
+ playlist = mu['pageData']['playlist']
+ tracks, track_ids = playlist['tracks'], playlist['trackIds']
+
+ # tracks dictionary shipped with webpage is limited to 150 tracks,
+ # missing tracks should be retrieved manually.
+ if len(tracks) < len(track_ids):
+ present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
+ missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
+ request = compat_urllib_request.Request(
+ 'https://music.yandex.ru/handlers/track-entries.jsx',
+ compat_urllib_parse.urlencode({
+ 'entries': ','.join(missing_track_ids),
+ 'lang': mu.get('settings', {}).get('lang', 'en'),
+ 'external-domain': 'music.yandex.ru',
+ 'overembed': 'false',
+ 'sign': mu.get('authData', {}).get('user', {}).get('sign'),
+ 'strict': 'true',
+ }).encode('utf-8'))
+ request.add_header('Referer', url)
+ request.add_header('X-Requested-With', 'XMLHttpRequest')
+
+ missing_tracks = self._download_json(
+ request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+ if missing_tracks:
+ tracks.extend(missing_tracks)
return self.playlist_result(
- entries, compat_str(playlist_id),
+ self._build_playlist(tracks),
+ compat_str(playlist_id),
playlist['title'], playlist.get('description'))
diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py
new file mode 100644
index 000000000..834d860af
--- /dev/null
+++ b/youtube_dl/extractor/yinyuetai.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class YinYueTaiIE(InfoExtractor):
+ IE_NAME = 'yinyuetai:video'
+ IE_DESC = '音悦Tai'
+ _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://v.yinyuetai.com/video/2322376',
+ 'md5': '6e3abe28d38e3a54b591f9f040595ce0',
+ 'info_dict': {
+ 'id': '2322376',
+ 'ext': 'mp4',
+ 'title': '少女时代_PARTY_Music Video Teaser',
+ 'creator': '少女时代',
+ 'duration': 25,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://v.yinyuetai.com/video/h5/2322376',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
+ 'Downloading mv info')['videoInfo']['coreVideoInfo']
+
+ if info['error']:
+ raise ExtractorError(info['errorMsg'], expected=True)
+
+ formats = [{
+ 'url': format_info['videoUrl'],
+ 'format_id': format_info['qualityLevel'],
+ 'format': format_info.get('qualityLevelName'),
+ 'filesize': format_info.get('fileSize'),
+ # though URLs ends with .flv, the downloaded files are in fact mp4
+ 'ext': 'mp4',
+ 'tbr': format_info.get('bitrate'),
+ } for format_info in info['videoUrlModels']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['videoName'],
+ 'thumbnail': info.get('bigHeadImage'),
+ 'creator': info.get('artistNames'),
+ 'duration': info.get('duration'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 894678a23..869f3e819 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -5,7 +5,7 @@ import re
import json
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
class YnetIE(InfoExtractor):
@@ -34,7 +34,7 @@ class YnetIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
+ content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
f4m_url = config['clip']['url']
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 97b98bbe8..78caeb8b3 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,123 +1,236 @@
# coding: utf-8
-
from __future__ import unicode_literals
-import math
-import random
-import re
-import time
+import base64
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..utils import ExtractorError
+
+from ..compat import (
+ compat_urllib_parse,
+ compat_ord,
+ compat_urllib_request,
)
class YoukuIE(InfoExtractor):
+ IE_NAME = 'youku'
+ IE_DESC = '优酷'
_VALID_URL = r'''(?x)
(?:
http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
youku:)
(?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
'''
- _TEST = {
- 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
- 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
- 'params': {
- 'test': False
+
+ _TESTS = [{
+ 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+ 'md5': '5f3af4192eabacc4501508d54a8cabd7',
+ 'info_dict': {
+ 'id': 'XMTc1ODE5Njcy_part1',
+ 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+ 'ext': 'flv'
+ }
+ }, {
+ 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+ 'info_dict': {
+ 'id': 'XODgxNjg1Mzk2',
+ 'title': '武媚娘传奇 85',
},
+ 'playlist_count': 11,
+ }, {
+ 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
'info_dict': {
- 'id': 'XNDgyMDQ2NTQw_part00',
- 'ext': 'flv',
- 'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+ 'id': 'XMTI1OTczNDM5Mg',
+ 'title': '花千骨 04',
+ },
+ 'playlist_count': 13,
+ 'skip': 'Available in China only',
+ }]
+
+ def construct_video_urls(self, data1, data2):
+ # get sid, token
+ def yk_t(s1, s2):
+ ls = list(range(256))
+ t = 0
+ for i in range(256):
+ t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+ ls[i], ls[t] = ls[t], ls[i]
+ s = bytearray()
+ x, y = 0, 0
+ for i in range(len(s2)):
+ y = (y + 1) % 256
+ x = (x + ls[y]) % 256
+ ls[x], ls[y] = ls[y], ls[x]
+ s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+ return bytes(s)
+
+ sid, token = yk_t(
+ b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+ ).decode('ascii').split('_')
+
+ # get oip
+ oip = data2['ip']
+
+ # get fileid
+ string_ls = list(
+ 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+ shuffled_string_ls = []
+ seed = data1['seed']
+ N = len(string_ls)
+ for ii in range(N):
+ seed = (seed * 0xd3 + 0x754f) % 0x10000
+ idx = seed * len(string_ls) // 0x10000
+ shuffled_string_ls.append(string_ls[idx])
+ del string_ls[idx]
+
+ fileid_dict = {}
+ for format in data1['streamtypes']:
+ streamfileid = [
+ int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+ fileid = ''.join(
+ [shuffled_string_ls[i] for i in streamfileid])
+ fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+ def get_fileid(format, n):
+ fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+ return fileid
+
+ # get ep
+ def generate_ep(format, n):
+ fileid = get_fileid(format, n)
+ ep_t = yk_t(
+ b'bf7e5f01',
+ ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+ )
+ ep = base64.b64encode(ep_t).decode('ascii')
+ return ep
+
+ # generate video_urls
+ video_urls_dict = {}
+ for format in data1['streamtypes']:
+ video_urls = []
+ for dt in data1['segs'][format]:
+ n = str(int(dt['no']))
+ param = {
+ 'K': dt['k'],
+ 'hd': self.get_hd(format),
+ 'myp': 0,
+ 'ts': dt['seconds'],
+ 'ypp': 0,
+ 'ctype': 12,
+ 'ev': 1,
+ 'token': token,
+ 'oip': oip,
+ 'ep': generate_ep(format, n)
+ }
+ video_url = \
+ 'http://k.youku.com/player/getFlvPath/' + \
+ 'sid/' + sid + \
+ '_' + str(int(n) + 1).zfill(2) + \
+ '/st/' + self.parse_ext_l(format) + \
+ '/fileid/' + get_fileid(format, n) + '?' + \
+ compat_urllib_parse.urlencode(param)
+ video_urls.append(video_url)
+ video_urls_dict[format] = video_urls
+
+ return video_urls_dict
+
+ def get_hd(self, fm):
+ hd_id_dict = {
+ 'flv': '0',
+ 'mp4': '1',
+ 'hd2': '2',
+ 'hd3': '3',
+ '3gp': '0',
+ '3gphd': '1'
}
- }
-
- def _gen_sid(self):
- nowTime = int(time.time() * 1000)
- random1 = random.randint(1000, 1998)
- random2 = random.randint(1000, 9999)
-
- return "%d%d%d" % (nowTime, random1, random2)
-
- def _get_file_ID_mix_string(self, seed):
- mixed = []
- source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
- seed = float(seed)
- for i in range(len(source)):
- seed = (seed * 211 + 30031) % 65536
- index = math.floor(seed / 65536 * len(source))
- mixed.append(source[int(index)])
- source.remove(source[int(index)])
- # return ''.join(mixed)
- return mixed
-
- def _get_file_id(self, fileId, seed):
- mixed = self._get_file_ID_mix_string(seed)
- ids = fileId.split('*')
- realId = []
- for ch in ids:
- if ch:
- realId.append(mixed[int(ch)])
- return ''.join(realId)
+ return hd_id_dict[fm]
+
+ def parse_ext_l(self, fm):
+ ext_dict = {
+ 'flv': 'flv',
+ 'mp4': 'mp4',
+ 'hd2': 'flv',
+ 'hd3': 'flv',
+ '3gp': 'flv',
+ '3gphd': 'mp4'
+ }
+ return ext_dict[fm]
+
+ def get_format_name(self, fm):
+ _dict = {
+ '3gp': 'h6',
+ '3gphd': 'h5',
+ 'flv': 'h4',
+ 'mp4': 'h3',
+ 'hd2': 'h2',
+ 'hd3': 'h1'
+ }
+ return _dict[fm]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+ video_id = self._match_id(url)
- config = self._download_json(info_url, video_id)
+ def retrieve_data(req_url, note):
+ req = compat_urllib_request.Request(req_url)
- error_code = config['data'][0].get('error_code')
- if error_code:
- # -8 means blocked outside China.
- error = config['data'][0].get('error') # Chinese and English, separated by newline.
- raise ExtractorError(error or 'Server reported error %i' % error_code,
- expected=True)
+ cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+ if cn_verification_proxy:
+ req.add_header('Ytdl-request-proxy', cn_verification_proxy)
- video_title = config['data'][0]['title']
- seed = config['data'][0]['seed']
+ raw_data = self._download_json(req, video_id, note=note)
+ return raw_data['data'][0]
- format = self._downloader.params.get('format', None)
- supported_format = list(config['data'][0]['streamfileids'].keys())
+ # request basic data
+ data1 = retrieve_data(
+ 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
+ 'Downloading JSON metadata 1')
+ data2 = retrieve_data(
+ 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+ 'Downloading JSON metadata 2')
- # TODO proper format selection
- if format is None or format == 'best':
- if 'hd2' in supported_format:
- format = 'hd2'
+ error_code = data1.get('error_code')
+ if error_code:
+ error = data1.get('error')
+ if error is not None and '因版权原因无法观看此视频' in error:
+ raise ExtractorError(
+ 'Youku said: Sorry, this video is available in China only', expected=True)
else:
- format = 'flv'
- ext = 'flv'
- elif format == 'worst':
- format = 'mp4'
- ext = 'mp4'
- else:
- format = 'flv'
- ext = 'flv'
-
- fileid = config['data'][0]['streamfileids'][format]
- keys = [s['k'] for s in config['data'][0]['segs'][format]]
- # segs is usually a dictionary, but an empty *list* if an error occured.
-
- files_info = []
- sid = self._gen_sid()
- fileid = self._get_file_id(fileid, seed)
-
- # column 8,9 of fileid represent the segment number
- # fileid[7:9] should be changed
- for index, key in enumerate(keys):
- temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
- download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
- info = {
- 'id': '%s_part%02d' % (video_id, index),
- 'url': download_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': ext,
- }
- files_info.append(info)
-
- return files_info
+ msg = 'Youku server reported error %i' % error_code
+ if error is not None:
+ msg += ': ' + error
+ raise ExtractorError(msg)
+
+ title = data1['title']
+
+ # generate video_urls_dict
+ video_urls_dict = self.construct_video_urls(data1, data2)
+
+ # construct info
+ entries = [{
+ 'id': '%s_part%d' % (video_id, i + 1),
+ 'title': title,
+ 'formats': [],
+ # some formats are not available for all parts, we have to detect
+ # which one has all
+ } for i in range(max(len(v) for v in data1['segs'].values()))]
+ for fm in data1['streamtypes']:
+ video_urls = video_urls_dict[fm]
+ for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+ entry['formats'].append({
+ 'url': video_url,
+ 'format_id': self.get_format_name(fm),
+ 'ext': self.parse_ext_l(fm),
+ 'filesize': int(seg['size']),
+ })
+
+ return {
+ '_type': 'multi_video',
+ 'id': video_id,
+ 'title': title,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0869c9fd4..030ec70ca 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -17,6 +17,9 @@ from ..compat import (
compat_chr,
compat_parse_qs,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
+ compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urlparse,
compat_str,
@@ -29,16 +32,22 @@ from ..utils import (
get_element_by_id,
int_or_none,
orderedSet,
+ parse_duration,
+ remove_start,
+ smuggle_url,
+ str_to_int,
unescapeHTML,
unified_strdate,
+ unsmuggle_url,
uppercase_escape,
+ ISO3166Utils,
)
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
+ _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
@@ -49,6 +58,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600)
+ def _ids_to_results(self, ids):
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
def _login(self):
"""
Attempt to log in to YouTube.
@@ -115,40 +129,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# Two-Factor
# TODO add SMS and phone call support - these require making a request and then prompting the user
- if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
- tfa_code = self._get_tfa_info()
+ if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
+ tfa_code = self._get_tfa_info('2-step verification code')
- if tfa_code is None:
- self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
- self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ if not tfa_code:
+ self._downloader.report_warning(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
return False
- # Unlike the first login form, secTok and timeStmp are both required for the TFA form
-
- match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
- if match is None:
- self._downloader.report_warning('Failed to get secTok - did the page structure change?')
- secTok = match.group(1)
- match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
- if match is None:
- self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
- timeStmp = match.group(1)
-
- tfa_form_strs = {
- 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- 'smsToken': '',
- 'smsUserPin': tfa_code,
- 'smsVerifyPin': 'Verify',
-
- 'PersistentCookie': 'yes',
- 'checkConnection': '',
- 'checkedDomains': 'youtube',
- 'pstMsg': '1',
- 'secTok': secTok,
- 'timeStmp': timeStmp,
- 'service': 'youtube',
- 'hl': 'en_US',
- }
+ tfa_code = remove_start(tfa_code, 'G-')
+
+ tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+
+ tfa_form_strs.update({
+ 'Pin': tfa_code,
+ 'TrustDevice': 'on',
+ })
+
tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
@@ -160,8 +158,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if tfa_results is False:
return False
- if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
- self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
+ if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
+ self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
return False
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
self._downloader.report_warning('unable to log in - did the page structure change?')
@@ -200,11 +198,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
))
- |youtu\.be/ # just youtu.be/xxxx
+ |(?:
+ youtu\.be| # just youtu.be/xxxx
+ vid\.plus # or vid.plus/xxxx
+ )/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
@@ -229,6 +230,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'44': {'ext': 'webm', 'width': 854, 'height': 480},
'45': {'ext': 'webm', 'width': 1280, 'height': 720},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3d videos
@@ -268,13 +271,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
# Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
@@ -284,11 +287,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
@@ -306,7 +309,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube'
_TESTS = [
{
- 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
'info_dict': {
'id': 'BaW_jenozKc',
'ext': 'mp4',
@@ -316,8 +319,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
'like_count': int,
'dislike_count': int,
+ 'start_time': 1,
+ 'end_time': 9,
}
},
{
@@ -328,7 +334,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
}
@@ -344,6 +353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'age_limit': 18,
}
},
{
@@ -360,6 +370,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
},
{
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
+ 'note': 'Use the first video ID in the URL',
+ 'info_dict': {
+ 'id': 'BaW_jenozKc',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'phihag',
+ 'upload_date': '20121002',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
'note': '256k DASH audio (format 141) via DASH manifest',
'info_dict': {
@@ -400,7 +430,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
- 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
+ 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
@@ -434,6 +464,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
'upload_date': '20140605',
+ 'age_limit': 18,
},
},
# Age-gate video with encrypted signature
@@ -447,6 +478,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO',
'upload_date': '20110629',
+ 'age_limit': 18,
},
},
# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
@@ -471,7 +503,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'lqQg6PlCWgI',
'ext': 'mp4',
- 'upload_date': '20120731',
+ 'upload_date': '20120724',
'uploader_id': 'olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
@@ -500,7 +532,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'qEJwOuvDf7I',
'info_dict': {
'id': 'qEJwOuvDf7I',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
'description': '',
'upload_date': '20150404',
@@ -511,6 +543,95 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': 'requires avconv',
}
},
+ # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+ {
+ 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+ 'info_dict': {
+ 'id': 'FIl7x6_3R5Y',
+ 'ext': 'mp4',
+ 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+ 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+ 'upload_date': '20150625',
+ 'uploader_id': 'dorappi2000',
+ 'uploader': 'dorappi2000',
+ 'formats': 'mincount:33',
+ },
+ },
+ # DASH manifest with segment_list
+ {
+ 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+ 'md5': '8ce563a1d667b599d21064e982ab9e31',
+ 'info_dict': {
+ 'id': 'CsmdDsKjzN8',
+ 'ext': 'mp4',
+ 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+ 'uploader': 'Airtek',
+ 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+ 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '135', # bestvideo
+ }
+ },
+ {
+ # Multifeed videos (multiple cameras), URL is for Main Camera
+ 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'title': 'teamPGP: Rocket League Noob Stream',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '6h8e8xoXJzg',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'PUOgX5z9xZw',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'teuwxikvS5k',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (zim)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://vid.plus/FlRa-iH7PGw',
+ 'only_matching': True,
+ }
]
def __init__(self, *args, **kwargs):
@@ -539,7 +660,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -775,16 +896,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
def _parse_dash_manifest(
- self, video_id, dash_manifest_url, player_url, age_gate):
+ self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
def decrypt_sig(mobj):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
dash_manifest_url, video_id,
note='Downloading DASH manifest',
- errnote='Could not download DASH manifest')
+ errnote='Could not download DASH manifest',
+ fatal=fatal)
+
+ if dash_doc is False:
+ return []
formats = []
for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
@@ -797,6 +922,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# TODO implement WebVTT downloading
pass
elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+ segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
format_id = r.attrib['id']
video_url = url_el.text
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
@@ -810,6 +936,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'filesize': filesize,
'fps': int_or_none(r.attrib.get('frameRate')),
}
+ if segment_list is not None:
+ f.update({
+ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
+ 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
+ 'protocol': 'http_dash_segments',
+ })
try:
existing_format = next(
fo for fo in formats
@@ -817,6 +949,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
except StopIteration:
full_info = self._formats.get(format_id, {}).copy()
full_info.update(f)
+ codecs = r.attrib.get('codecs')
+ if codecs:
+ if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+ full_info['vcodec'] = codecs
+ elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+ full_info['acodec'] = codecs
formats.append(full_info)
else:
existing_format.update(f)
@@ -825,14 +963,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return formats
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
proto = (
'http' if self._downloader.params.get('prefer_insecure', False)
else 'https')
+ # Optional playback window requested through the URL: 't' or 'start' gives the
+ # start offset and 'end' the end offset. The URL fragment is inspected before
+ # the query string, and a bound is only assigned while it is still None, so
+ # earlier matches take precedence ('t' is also tried before 'start').
+ start_time = None
+ end_time = None
+ parsed_url = compat_urllib_parse_urlparse(url)
+ for component in [parsed_url.fragment, parsed_url.query]:
+ query = compat_parse_qs(component)
+ if start_time is None and 't' in query:
+ start_time = parse_duration(query['t'][0])
+ if start_time is None and 'start' in query:
+ start_time = parse_duration(query['start'][0])
+ if end_time is None and 'end' in query:
+ end_time = parse_duration(query['end'][0])
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
@@ -846,8 +998,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
player_url = None
+ dash_mpds = []
+
+ def add_dash_mpd(video_info):
+     # Record the first 'dashmpd' URL of video_info in dash_mpds, unless that
+     # URL has been recorded already
+     manifest_urls = video_info.get('dashmpd')
+     if not manifest_urls:
+         return
+     first_url = manifest_urls[0]
+     if first_url not in dash_mpds:
+         dash_mpds.append(first_url)
+
# Get video info
embed_webpage = None
+ is_live = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -866,24 +1026,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(video_info)
else:
age_gate = False
- try:
- # Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find ytplayer.config') # caught below
+ video_info = None
+ # Try looking directly into the video webpage
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+ if mobj:
json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code)
args = ytplayer_config['args']
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- if not args.get('url_encoded_fmt_stream_map'):
- raise ValueError('No stream_map present') # caught below
- except ValueError:
- # We fallback to the get_video_info pages (used by the embed page)
+ if args.get('url_encoded_fmt_stream_map'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ if args.get('livestream') == '1' or args.get('live_playback') == 1:
+ is_live = True
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ # We also try looking in get_video_info since it may contain different dashmpd
+ # URL that points to a DASH manifest with possibly different itag set (some itags
+ # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+ # manifest pointed by get_video_info's dashmpd).
+ # The general idea is to take a union of itags of both DASH manifests (for example
+ # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = (
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (proto, video_id, el_type))
@@ -891,11 +1058,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_info_url,
video_id, note=False,
errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
+ get_video_info = compat_parse_qs(video_info_webpage)
+ if get_video_info.get('use_cipher_signature') != ['True']:
+ add_dash_mpd(get_video_info)
+ if not video_info:
+ video_info = get_video_info
+ if 'token' in get_video_info:
break
if 'token' not in video_info:
if 'reason' in video_info:
+ if 'The uploader has not made this video available in your country.' in video_info['reason']:
+ regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+ if regions_allowed:
+ raise ExtractorError('YouTube said: This video is available in %s only' % (
+ ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+ expected=True)
raise ExtractorError(
'YouTube said: %s' % video_info['reason'][0],
expected=True, video_id=video_id)
@@ -904,6 +1081,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'"token" parameter not in video info for unknown reason',
video_id=video_id)
+ # title
+ if 'title' in video_info:
+ video_title = video_info['title'][0]
+ else:
+ self._downloader.report_warning('Unable to extract video title')
+ video_title = '_'
+
+ # description
+ video_description = get_element_by_id("eow-description", video_webpage)
+ if video_description:
+ video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ title="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ class="yt-uix-redirect-link"\s*>
+ [^<]+
+ </a>
+ ''', r'\1', video_description)
+ video_description = clean_html(video_description)
+ else:
+ fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+ if fd_mobj:
+ video_description = unescapeHTML(fd_mobj.group(1))
+ else:
+ video_description = ''
+
+ # Multifeed handling: when video_info carries 'multifeed_metadata_list' (and the
+ # URL was not smuggled with 'force_singlefeed'), the comma-separated feed list is
+ # turned into a playlist of url_transparent entries for the 'Youtube' extractor.
+ # Each entry URL is smuggled with 'force_singlefeed' so that it is not expanded
+ # into a playlist again. With the 'noplaylist' param set, only the requested
+ # video is extracted and a note is printed instead.
+ if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+ if not self._downloader.params.get('noplaylist'):
+ entries = []
+ feed_ids = []
+ multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+ for feed in multifeed_metadata_list.split(','):
+ feed_data = compat_parse_qs(feed)
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+ })
+ feed_ids.append(feed_data['id'][0])
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(entries, video_id, video_title, video_description)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
if 'view_count' in video_info:
view_count = int(video_info['view_count'][0])
else:
@@ -919,7 +1145,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# uploader
if 'author' not in video_info:
raise ExtractorError('Unable to extract uploader name')
- video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
# uploader_id
video_uploader_id = None
@@ -929,13 +1155,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
self._downloader.report_warning('unable to extract uploader nickname')
- # title
- if 'title' in video_info:
- video_title = video_info['title'][0]
- else:
- self._downloader.report_warning('Unable to extract video title')
- video_title = '_'
-
# thumbnail image
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -946,18 +1165,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+ video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
- upload_date = None
- mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
- if mobj is None:
- mobj = re.search(
- r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
- video_webpage)
- if mobj is not None:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- upload_date = unified_strdate(upload_date)
+ # Prefer the machine-readable 'datePublished' meta tag; only fall back to
+ # scraping the human-readable date from the watch page when it is missing.
+ upload_date = self._html_search_meta(
+     'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+     upload_date = self._search_regex(
+         [r'(?s)id="eow-date.*?>(.*?)</span>',
+          r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+         video_webpage, 'upload date', default=None)
+     if upload_date:
+         # Normalise the scraped text itself: turn '/', ',' and '-' into blanks
+         # and collapse whitespace. (The previous revision read mobj.group(1),
+         # but mobj is not assigned by this lookup any more, so it referred to
+         # an unrelated earlier match.)
+         upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
+ upload_date = unified_strdate(upload_date)
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -970,33 +1190,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
video_categories = None
- # description
- video_description = get_element_by_id("eow-description", video_webpage)
- if video_description:
- video_description = re.sub(r'''(?x)
- <a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- title="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="yt-uix-redirect-link"\s*>
- [^<]+
- </a>
- ''', r'\1', video_description)
- video_description = clean_html(video_description)
- else:
- fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
- if fd_mobj:
- video_description = unescapeHTML(fd_mobj.group(1))
- else:
- video_description = ''
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
def _extract_count(count_name):
- count = self._search_regex(
- r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
- video_webpage, count_name, default=None)
- if count is not None:
- return int(count.replace(',', ''))
- return None
+ return str_to_int(self._search_regex(
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ % re.escape(count_name),
+ video_webpage, count_name, default=None))
+
like_count = _extract_count('like')
dislike_count = _extract_count('dislike')
@@ -1008,7 +1211,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning('unable to extract video duration')
video_duration = None
else:
- video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
+ video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
# annotations
video_annotations = None
@@ -1040,7 +1243,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
- url_map = {}
+ formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data:
@@ -1086,7 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- r'html5player-([^/]+?)(?:/html5player)?\.js',
+ r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1100,8 +1303,50 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
url += '&signature=' + signature
if 'ratebypass' not in url:
url += '&ratebypass=yes'
- url_map[format_id] = url
- formats = _map_to_format_list(url_map)
+
+ # Some itags are not included in DASH manifest thus corresponding formats will
+ # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
+ # Trying to extract metadata from url_encoded_fmt_stream_map entry.
+ mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
+ width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+ dct = {
+ 'format_id': format_id,
+ 'url': url,
+ 'player_url': player_url,
+ 'filesize': int_or_none(url_data.get('clen', [None])[0]),
+ 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
+ 'width': width,
+ 'height': height,
+ 'fps': int_or_none(url_data.get('fps', [None])[0]),
+ 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+ }
+ type_ = url_data.get('type', [None])[0]
+ if type_:
+ type_split = type_.split(';')
+ kind_ext = type_split[0].split('/')
+ if len(kind_ext) == 2:
+ kind, ext = kind_ext
+ dct['ext'] = ext
+ if kind in ('audio', 'video'):
+ codecs = None
+ for mobj in re.finditer(
+ r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
+ if mobj.group('key') == 'codecs':
+ codecs = mobj.group('val')
+ break
+ if codecs:
+     # Muxed type strings list the video codec first and the audio codec
+     # second (e.g. codecs="avc1.42001E, mp4a.40.2"), so unpack in that
+     # order; strip the blank that follows the comma separator.
+     codecs = [codec.strip() for codec in codecs.split(',')]
+     if len(codecs) == 2:
+         vcodec, acodec = codecs
+     else:
+         # Single codec: which slot it fills depends on the MIME kind
+         acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
+     dct.update({
+         'acodec': acodec,
+         'vcodec': vcodec,
+     })
+ if format_id in self._formats:
+ dct.update(self._formats[format_id])
+ formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
@@ -1111,23 +1356,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_mpd = video_info.get('dashmpd')
- if dash_mpd:
- dash_manifest_url = dash_mpd[0]
+ dash_mpd_fatal = True
+ for dash_manifest_url in dash_mpds:
+ dash_formats = {}
try:
- dash_formats = self._parse_dash_manifest(
- video_id, dash_manifest_url, player_url, age_gate)
+ for df in self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
+ # Do not overwrite DASH format found in some previous DASH manifest
+ if df['format_id'] not in dash_formats:
+ dash_formats[df['format_id']] = df
+ # Additional DASH manifests may end up in HTTP Error 403 therefore
+ # allow them to fail without bug report message if we already have
+ # some DASH manifest succeeded. This is temporary workaround to reduce
+ # burst of bug reports until we figure out the reason and whether it
+ # can be fixed at all.
+ dash_mpd_fatal = False
except (ExtractorError, KeyError) as e:
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
- else:
- # Hide the formats we found through non-DASH
- dash_keys = set(df['format_id'] for df in dash_formats)
- for f in formats:
- if f['format_id'] in dash_keys:
- f['format_id'] = 'nondash-%s' % f['format_id']
- f['preference'] = f.get('preference', 0) - 10000
- formats.extend(dash_formats)
+ if dash_formats:
+ # Remove the formats we found through non-DASH, they
+ # contain less info and it can be wrong, because we use
+ # fixed values (for example the resolution). See
+ # https://github.com/rg3/youtube-dl/issues/5774 for an
+ # example.
+ formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+ formats.extend(dash_formats.values())
# Check for malformed aspect ratio
stretched_m = re.search(
@@ -1150,6 +1404,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
+ 'tags': video_tags,
'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration,
@@ -1161,6 +1416,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'dislike_count': dislike_count,
'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
'formats': formats,
+ 'is_live': is_live,
+ 'start_time': start_time,
+ 'end_time': end_time,
}
@@ -1261,11 +1519,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@@ -1289,7 +1542,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _extract_playlist(self, playlist_id):
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
- more_widget_html = content_html = page
for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
match = match.strip()
@@ -1309,36 +1561,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
self.report_warning('Youtube gives an alert message: ' + match)
# Extract the video ids from the playlist pages
- ids = []
-
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- ids.extend(new_ids)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+ def _entries():
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.finditer(self._VIDEO_RE, content_html)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+ for vid_id in new_ids:
+ yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, 'title')
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_id, playlist_title)
+ return self.playlist_result(_entries(), playlist_id, playlist_title)
def _real_extract(self, url):
# Extract playlist id
@@ -1398,6 +1650,24 @@ class YoutubeChannelIE(InfoExtractor):
channel_id = self._match_id(url)
url = self._TEMPLATE_URL % channel_id
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+     url + '?view=57', channel_id,
+     'Downloading channel page', fatal=False)
+ # With fatal=False a failed download yields a false value instead of raising.
+ # Skip the channel id lookups in that case so the regex helpers are never
+ # handed a non-string, and let the page-by-page fallback below take over.
+ channel_playlist_id = None
+ if channel_page:
+     channel_playlist_id = self._html_search_meta(
+         'channelId', channel_page, 'channel id', default=None)
+     if not channel_playlist_id:
+         channel_playlist_id = self._search_regex(
+             r'data-channel-external-id="([^"]+)"',
+             channel_page, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+     # The uploads playlist id is the channel id with the 'UC' prefix replaced by 'UU'
+     playlist_id = 'UU' + channel_playlist_id[2:]
+     return self.url_result(
+         compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
autogenerated = re.search(r'''(?x)
class="[^"]*?(?:
@@ -1486,7 +1756,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
for pagenum in itertools.count(1):
url_query = {
- 'search_query': query,
+ 'search_query': query.encode('utf-8'),
'page': pagenum,
'spf': 'navigate',
}
@@ -1534,14 +1804,14 @@ class YoutubeSearchURLIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+ query = compat_urllib_parse_unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
part_codes = re.findall(
- r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
+ r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)
entries = []
for part_code in part_codes:
part_title = self._html_search_regex(
@@ -1601,20 +1871,10 @@ class YoutubeShowIE(InfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
- Base class for extractors that fetch info from
- http://www.youtube.com/feed_ajax
+ Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- # use action_load_personal_feed instead of action_load_system_feed
- _PERSONAL_FEED = False
-
- @property
- def _FEED_TEMPLATE(self):
- action = 'action_load_system_feed'
- if self._PERSONAL_FEED:
- action = 'action_load_personal_feed'
- return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
@property
def IE_NAME(self):
@@ -1624,36 +1884,38 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- feed_entries = []
- paging = 0
- for i in itertools.count(1):
- info = self._download_json(
- self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i,
- transform_source=uppercase_escape)
- feed_html = info.get('feed_html') or info.get('content_html')
- load_more_widget_html = info.get('load_more_widget_html') or feed_html
- m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
- ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in ids)
- mobj = re.search(
- r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
- load_more_widget_html)
- if mobj is None:
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
+
+ # The extraction process is the same as for playlists, but the regex
+ # for the video ids doesn't contain an index
+ ids = []
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for unicity and break when portion has no new videos.
+ # A real list is built here on purpose: on Python 3 filter() returns a lazy
+ # iterator that is always truthy, so the emptiness check would never fire.
+ new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
+ if not new_ids:
break
- paging = mobj.group('paging')
- return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
+ ids.extend(new_ids)
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_NAME = 'youtube:recommended'
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
- _FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = 'Youtube Recommended videos'
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ return self.playlist_result(
+ self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE):
@@ -1667,15 +1929,6 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE):
return self._extract_playlist('WL')
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_NAME = 'youtube:history'
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
- _FEED_NAME = 'history'
- _PERSONAL_FEED = True
- _PLAYLIST_TITLE = 'Youtube Watch History'
-
-
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
@@ -1688,42 +1941,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
return self.url_result(playlist_id, 'YoutubePlaylist')
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:subscriptions'
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube Subscriptions'
- page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
- # The extraction process is the same as for playlists, but the regex
- # for the video ids doesn't contain an index
- ids = []
- more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ # Settings for the "recommended" feed; the URL pattern also accepts the
+ # shorthands ":ytrec" and ":ytrecommended".
+ # NOTE(review): _FEED_NAME and _PLAYLIST_TITLE are presumably consumed by
+ # YoutubeFeedsInfoExtractor - confirm against the base class.
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _FEED_NAME = 'recommended'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
- for page_num in itertools.count(1):
- matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
- ids.extend(new_ids)
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+ # Settings for the "subscriptions" feed; the URL pattern also accepts the
+ # shorthands ":ytsubs" and ":ytsubscriptions".
+ # NOTE(review): _FEED_NAME and _PLAYLIST_TITLE are presumably consumed by
+ # YoutubeFeedsInfoExtractor - confirm against the base class.
+ IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ _FEED_NAME = 'subscriptions'
+ _PLAYLIST_TITLE = 'Youtube Subscriptions'
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    # Settings for the watch history feed; the URL pattern also accepts the
+    # shorthand ":ythistory".
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    # Raw string: "\." must reach the regex engine verbatim. In a plain string
+    # literal it is an invalid escape sequence that newer Pythons warn about.
+    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
class YoutubeTruncatedURLIE(InfoExtractor):
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
index 1afbe68ed..7dc1e2f2b 100644
--- a/youtube_dl/extractor/zingmp3.py
+++ b/youtube_dl/extractor/zingmp3.py
@@ -4,12 +4,18 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import ExtractorError
class ZingMp3BaseInfoExtractor(InfoExtractor):
- @staticmethod
- def _extract_item(item):
+ def _extract_item(self, item):
+ error_message = item.find('./errormessage').text
+ if error_message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message),
+ expected=True)
+
title = item.find('./title').text.strip()
source = item.find('./source').text
extension = item.attrib['type']
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 22dbc3aec..9016e3498 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -145,12 +145,16 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--list-extractors',
action='store_true', dest='list_extractors', default=False,
- help='List all supported extractors and the URLs they would handle')
+ help='List all supported extractors')
general.add_option(
'--extractor-descriptions',
action='store_true', dest='list_extractor_descriptions', default=False,
help='Output descriptions of all supported extractors')
general.add_option(
+ '--force-generic-extractor',
+ action='store_true', dest='force_generic_extractor', default=False,
+ help='Force extraction to use the generic extractor')
+ general.add_option(
'--default-search',
dest='default_search', metavar='PREFIX',
help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')
@@ -215,7 +219,7 @@ def parseOpts(overrideArguments=None):
selection.add_option(
'--playlist-items',
dest='playlist_items', metavar='ITEM_SPEC', default=None,
- help='Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
+ help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
selection.add_option(
'--match-title',
dest='matchtitle', metavar='REGEX',
@@ -342,12 +346,13 @@ def parseOpts(overrideArguments=None):
video_format.add_option(
'--youtube-skip-dash-manifest',
action='store_false', dest='youtube_include_dash_manifest',
- help='Do not download the DASH manifest on YouTube videos')
+ help='Do not download the DASH manifests and related data on YouTube videos')
video_format.add_option(
'--merge-output-format',
action='store', dest='merge_output_format', metavar='FORMAT', default=None,
help=(
- 'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
+ 'If a merge is required (e.g. bestvideo+bestaudio), '
+ 'output to given container format. One of mkv, mp4, ogg, webm, flv. '
'Ignored if no merge is required'))
subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
@@ -537,7 +542,7 @@ def parseOpts(overrideArguments=None):
verbosity.add_option(
'--dump-pages', '--dump-intermediate-pages',
action='store_true', dest='dump_intermediate_pages', default=False,
- help='Print downloaded pages to debug problems (very verbose)')
+ help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
verbosity.add_option(
'--write-pages',
action='store_true', dest='write_pages', default=False,
@@ -686,7 +691,11 @@ def parseOpts(overrideArguments=None):
postproc.add_option(
'--recode-video',
metavar='FORMAT', dest='recodevideo', default=None,
- help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)')
+ help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)')
+ postproc.add_option(
+ '--postprocessor-args',
+ dest='postprocessor_args', metavar='ARGS',
+ help='Give these arguments to the postprocessor')
postproc.add_option(
'-k', '--keep-video',
action='store_true', dest='keepvideo', default=False,
@@ -713,7 +722,7 @@ def parseOpts(overrideArguments=None):
help='Parse additional metadata like song title / artist from the video title. '
'The format syntax is the same as --output, '
'the parsed parameters replace existing values. '
- 'Additional templates: %(album), %(artist). '
+ 'Additional templates: %(album)s, %(artist)s. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise"')
postproc.add_option(
@@ -725,7 +734,7 @@ def parseOpts(overrideArguments=None):
metavar='POLICY', dest='fixup', default='detect_or_warn',
help='Automatically correct known faults of the file. '
'One of never (do nothing), warn (only emit a warning), '
- 'detect_or_warn(the default; fix file if we can, warn otherwise)')
+ 'detect_or_warn (the default; fix file if we can, warn otherwise)')
postproc.add_option(
'--prefer-avconv',
action='store_false', dest='prefer_ffmpeg',
diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py
index 3b0e8ddd8..4191d040b 100644
--- a/youtube_dl/postprocessor/common.py
+++ b/youtube_dl/postprocessor/common.py
@@ -23,6 +23,9 @@ class PostProcessor(object):
PostProcessor objects follow a "mutual registration" process similar
to InfoExtractor objects.
+
+ A PostProcessor can optionally pick up a list of additional command-line
+ arguments through self._configuration_args().
"""
_downloader = None
@@ -57,6 +60,13 @@ class PostProcessor(object):
except Exception:
self._downloader.report_warning(errnote)
+    def _configuration_args(self, default=None):
+        """Return the extra command-line arguments configured for postprocessors.
+
+        Reads the 'postprocessor_args' entry of the downloader params. When
+        that entry is None, `default` is returned instead. When `default` is
+        omitted too, a fresh empty list is returned, so callers never share
+        (and accidentally mutate) a single default list.
+        """
+        pp_args = self._downloader.params.get('postprocessor_args')
+        if pp_args is None:
+            return [] if default is None else default
+        assert isinstance(pp_args, list)
+        return pp_args
+
class AudioConversionError(PostProcessingError):
pass
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 4868a42fd..e19dbf73d 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -7,12 +7,9 @@ import subprocess
from .ffmpeg import FFmpegPostProcessor
-from ..compat import (
- compat_urlretrieve,
-)
from ..utils import (
- determine_ext,
check_executable,
+ encodeArgument,
encodeFilename,
PostProcessingError,
prepend_extension,
@@ -25,34 +22,48 @@ class EmbedThumbnailPPError(PostProcessingError):
class EmbedThumbnailPP(FFmpegPostProcessor):
+ def __init__(self, downloader=None, already_have_thumbnail=False):
+ # already_have_thumbnail: when true, run() leaves the thumbnail file on
+ # disk after embedding instead of removing it.
+ super(EmbedThumbnailPP, self).__init__(downloader)
+ self._already_have_thumbnail = already_have_thumbnail
+
def run(self, info):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
- temp_thumbnail = filename + '.' + determine_ext(info['thumbnail'])
- if not info.get('thumbnail'):
+ if not info.get('thumbnails'):
raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.')
- compat_urlretrieve(info['thumbnail'], temp_thumbnail)
+ thumbnail_filename = info['thumbnails'][-1]['filename']
+
+ if not os.path.exists(encodeFilename(thumbnail_filename)):
+ self._downloader.report_warning(
+ 'Skipping embedding the thumbnail because the file is missing.')
+ return [], info
if info['ext'] == 'mp3':
options = [
- '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
+ '-c', 'copy', '-map', '0', '-map', '1',
'-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
- self.run_ffmpeg(filename, temp_filename, options)
+ self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
- os.remove(encodeFilename(temp_thumbnail))
+ if not self._already_have_thumbnail:
+ os.remove(encodeFilename(thumbnail_filename))
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
- elif info['ext'] == 'm4a':
+ elif info['ext'] in ['m4a', 'mp4']:
if not check_executable('AtomicParsley', ['-v']):
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
- cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename]
+ cmd = [encodeFilename('AtomicParsley', True),
+ encodeFilename(filename, True),
+ encodeArgument('--artwork'),
+ encodeFilename(thumbnail_filename, True),
+ encodeArgument('-o'),
+ encodeFilename(temp_filename, True)]
self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
@@ -66,7 +77,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
msg = stderr.decode('utf-8', 'replace').strip()
raise EmbedThumbnailPPError(msg)
- os.remove(encodeFilename(temp_thumbnail))
+ if not self._already_have_thumbnail:
+ os.remove(encodeFilename(thumbnail_filename))
# for formats that don't support thumbnails (like 3gp) AtomicParsley
# won't create to the temporary file
if b'No changes' in stdout:
@@ -75,6 +87,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
else:
- raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+ raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
return [], info
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
index 341437575..13794b7ba 100644
--- a/youtube_dl/postprocessor/execafterdownload.py
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -8,8 +8,8 @@ from ..utils import PostProcessingError
class ExecAfterDownloadPP(PostProcessor):
- def __init__(self, downloader=None, verboseOutput=None, exec_cmd=None):
- self.verboseOutput = verboseOutput
+ def __init__(self, downloader, exec_cmd):
+ super(ExecAfterDownloadPP, self).__init__(downloader)
self.exec_cmd = exec_cmd
def run(self, information):
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 214de39f9..1f723908b 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -21,6 +21,7 @@ from ..utils import (
shell_quote,
subtitles_filename,
dfxp2srt,
+ ISO639Utils,
)
@@ -130,6 +131,8 @@ class FFmpegPostProcessor(PostProcessor):
oldest_mtime = min(
os.stat(encodeFilename(path)).st_mtime for path in input_paths)
+ opts += self._configuration_args()
+
files_cmd = []
for path in input_paths:
files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
@@ -262,7 +265,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
# If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
if (new_path == path or
(self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
- self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
+ self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path)
return [], information
try:
@@ -293,13 +296,16 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
def run(self, information):
path = information['filepath']
- prefix, sep, ext = path.rpartition('.')
- outpath = prefix + sep + self._preferedformat
if information['ext'] == self._preferedformat:
self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
return [], information
+ options = []
+ if self._preferedformat == 'avi':
+ options.extend(['-c:v', 'libxvid', '-vtag', 'XVID'])
+ prefix, sep, ext = path.rpartition('.')
+ outpath = prefix + sep + self._preferedformat
self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
- self.run_ffmpeg(path, outpath, [])
+ self.run_ffmpeg(path, outpath, options)
information['filepath'] = outpath
information['format'] = self._preferedformat
information['ext'] = self._preferedformat
@@ -307,199 +313,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
- # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
- _lang_map = {
- 'aa': 'aar',
- 'ab': 'abk',
- 'ae': 'ave',
- 'af': 'afr',
- 'ak': 'aka',
- 'am': 'amh',
- 'an': 'arg',
- 'ar': 'ara',
- 'as': 'asm',
- 'av': 'ava',
- 'ay': 'aym',
- 'az': 'aze',
- 'ba': 'bak',
- 'be': 'bel',
- 'bg': 'bul',
- 'bh': 'bih',
- 'bi': 'bis',
- 'bm': 'bam',
- 'bn': 'ben',
- 'bo': 'bod',
- 'br': 'bre',
- 'bs': 'bos',
- 'ca': 'cat',
- 'ce': 'che',
- 'ch': 'cha',
- 'co': 'cos',
- 'cr': 'cre',
- 'cs': 'ces',
- 'cu': 'chu',
- 'cv': 'chv',
- 'cy': 'cym',
- 'da': 'dan',
- 'de': 'deu',
- 'dv': 'div',
- 'dz': 'dzo',
- 'ee': 'ewe',
- 'el': 'ell',
- 'en': 'eng',
- 'eo': 'epo',
- 'es': 'spa',
- 'et': 'est',
- 'eu': 'eus',
- 'fa': 'fas',
- 'ff': 'ful',
- 'fi': 'fin',
- 'fj': 'fij',
- 'fo': 'fao',
- 'fr': 'fra',
- 'fy': 'fry',
- 'ga': 'gle',
- 'gd': 'gla',
- 'gl': 'glg',
- 'gn': 'grn',
- 'gu': 'guj',
- 'gv': 'glv',
- 'ha': 'hau',
- 'he': 'heb',
- 'hi': 'hin',
- 'ho': 'hmo',
- 'hr': 'hrv',
- 'ht': 'hat',
- 'hu': 'hun',
- 'hy': 'hye',
- 'hz': 'her',
- 'ia': 'ina',
- 'id': 'ind',
- 'ie': 'ile',
- 'ig': 'ibo',
- 'ii': 'iii',
- 'ik': 'ipk',
- 'io': 'ido',
- 'is': 'isl',
- 'it': 'ita',
- 'iu': 'iku',
- 'ja': 'jpn',
- 'jv': 'jav',
- 'ka': 'kat',
- 'kg': 'kon',
- 'ki': 'kik',
- 'kj': 'kua',
- 'kk': 'kaz',
- 'kl': 'kal',
- 'km': 'khm',
- 'kn': 'kan',
- 'ko': 'kor',
- 'kr': 'kau',
- 'ks': 'kas',
- 'ku': 'kur',
- 'kv': 'kom',
- 'kw': 'cor',
- 'ky': 'kir',
- 'la': 'lat',
- 'lb': 'ltz',
- 'lg': 'lug',
- 'li': 'lim',
- 'ln': 'lin',
- 'lo': 'lao',
- 'lt': 'lit',
- 'lu': 'lub',
- 'lv': 'lav',
- 'mg': 'mlg',
- 'mh': 'mah',
- 'mi': 'mri',
- 'mk': 'mkd',
- 'ml': 'mal',
- 'mn': 'mon',
- 'mr': 'mar',
- 'ms': 'msa',
- 'mt': 'mlt',
- 'my': 'mya',
- 'na': 'nau',
- 'nb': 'nob',
- 'nd': 'nde',
- 'ne': 'nep',
- 'ng': 'ndo',
- 'nl': 'nld',
- 'nn': 'nno',
- 'no': 'nor',
- 'nr': 'nbl',
- 'nv': 'nav',
- 'ny': 'nya',
- 'oc': 'oci',
- 'oj': 'oji',
- 'om': 'orm',
- 'or': 'ori',
- 'os': 'oss',
- 'pa': 'pan',
- 'pi': 'pli',
- 'pl': 'pol',
- 'ps': 'pus',
- 'pt': 'por',
- 'qu': 'que',
- 'rm': 'roh',
- 'rn': 'run',
- 'ro': 'ron',
- 'ru': 'rus',
- 'rw': 'kin',
- 'sa': 'san',
- 'sc': 'srd',
- 'sd': 'snd',
- 'se': 'sme',
- 'sg': 'sag',
- 'si': 'sin',
- 'sk': 'slk',
- 'sl': 'slv',
- 'sm': 'smo',
- 'sn': 'sna',
- 'so': 'som',
- 'sq': 'sqi',
- 'sr': 'srp',
- 'ss': 'ssw',
- 'st': 'sot',
- 'su': 'sun',
- 'sv': 'swe',
- 'sw': 'swa',
- 'ta': 'tam',
- 'te': 'tel',
- 'tg': 'tgk',
- 'th': 'tha',
- 'ti': 'tir',
- 'tk': 'tuk',
- 'tl': 'tgl',
- 'tn': 'tsn',
- 'to': 'ton',
- 'tr': 'tur',
- 'ts': 'tso',
- 'tt': 'tat',
- 'tw': 'twi',
- 'ty': 'tah',
- 'ug': 'uig',
- 'uk': 'ukr',
- 'ur': 'urd',
- 'uz': 'uzb',
- 've': 'ven',
- 'vi': 'vie',
- 'vo': 'vol',
- 'wa': 'wln',
- 'wo': 'wol',
- 'xh': 'xho',
- 'yi': 'yid',
- 'yo': 'yor',
- 'za': 'zha',
- 'zh': 'zho',
- 'zu': 'zul',
- }
-
- @classmethod
- def _conver_lang_code(cls, code):
- """Convert language code from ISO 639-1 to ISO 639-2/T"""
- return cls._lang_map.get(code[:2])
-
def run(self, information):
if information['ext'] not in ['mp4', 'mkv']:
self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +338,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
opts += ['-c:s', 'mov_text']
for (i, lang) in enumerate(sub_langs):
opts.extend(['-map', '%d:0' % (i + 1)])
- lang_code = self._conver_lang_code(lang)
+ lang_code = ISO639Utils.short2long(lang)
if lang_code is not None:
opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
@@ -591,6 +404,23 @@ class FFmpegMergerPP(FFmpegPostProcessor):
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return info['__files_to_merge'], info
+    def can_merge(self):
+        """Tell whether the detected converter can mux separate video and audio.
+
+        Any converter other than avconv is treated as capable. An avconv older
+        than version 10-0 is reported as unable, and a warning is sent through
+        the downloader when one is attached.
+        """
+        # TODO: figure out merge-capable ffmpeg version
+        tool = self.basename
+        if tool != 'avconv':
+            return True
+
+        required_version = '10-0'
+        if not is_outdated_version(self._versions[tool], required_version):
+            return True
+
+        warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, '
+                   'youtube-dl will download single file media. '
+                   'Update %s to version %s or newer to fix this.') % (
+                       tool, tool, required_version)
+        if self._downloader:
+            self._downloader.report_warning(warning)
+        return False
+
class FFmpegFixupStretchedPP(FFmpegPostProcessor):
def run(self, info):
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
index 93d0abcf6..7d88e1308 100644
--- a/youtube_dl/postprocessor/xattrpp.py
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -3,18 +3,34 @@ from __future__ import unicode_literals
import os
import subprocess
import sys
+import errno
from .common import PostProcessor
-from ..compat import (
- subprocess_check_output
-)
from ..utils import (
check_executable,
hyphenate_date,
version_tuple,
+ PostProcessingError,
+ encodeArgument,
+ encodeFilename,
)
+class XAttrMetadataError(PostProcessingError):
+    """Failure to write an extended attribute, classified into a coarse reason.
+
+    code -- errno value or exit status reported by the failing operation
+    msg  -- accompanying error text (strerror or captured stderr)
+
+    After construction self.reason is one of 'NO_SPACE', 'VALUE_TOO_LONG'
+    or 'NOT_SUPPORTED' (the fallback).
+    """
+
+    def __init__(self, code=None, msg='Unknown error'):
+        super(XAttrMetadataError, self).__init__(msg)
+        self.code = code
+
+        # The error text can be None (strerror is optional), so normalise it
+        # before doing substring matching
+        message = self.msg or ''
+
+        # Classify by errno first, then by the wording of the message
+        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
+                'No space left' in message or 'Disk quota exceeded' in message):
+            self.reason = 'NO_SPACE'
+        elif self.code == errno.E2BIG or 'Argument list too long' in message:
+            self.reason = 'VALUE_TOO_LONG'
+        else:
+            self.reason = 'NOT_SUPPORTED'
+
+
class XAttrMetadataPP(PostProcessor):
#
@@ -51,7 +67,10 @@ class XAttrMetadataPP(PostProcessor):
raise ImportError
def write_xattr(path, key, value):
- return xattr.setxattr(path, key, value)
+ try:
+ xattr.set(path, key, value)
+ except EnvironmentError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
except ImportError:
if os.name == 'nt':
@@ -62,8 +81,11 @@ class XAttrMetadataPP(PostProcessor):
assert os.path.exists(path)
ads_fn = path + ":" + key
- with open(ads_fn, "wb") as f:
- f.write(value)
+ try:
+ with open(ads_fn, "wb") as f:
+ f.write(value)
+ except EnvironmentError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
else:
user_has_setfattr = check_executable("setfattr", ['--version'])
user_has_xattr = check_executable("xattr", ['-h'])
@@ -71,12 +93,27 @@ class XAttrMetadataPP(PostProcessor):
if user_has_setfattr or user_has_xattr:
def write_xattr(path, key, value):
+ value = value.decode('utf-8')
if user_has_setfattr:
- cmd = ['setfattr', '-n', key, '-v', value, path]
+ executable = 'setfattr'
+ opts = ['-n', key, '-v', value]
elif user_has_xattr:
- cmd = ['xattr', '-w', key, value, path]
-
- subprocess_check_output(cmd)
+ executable = 'xattr'
+ opts = ['-w', key, value]
+
+ cmd = ([encodeFilename(executable, True)] +
+ [encodeArgument(o) for o in opts] +
+ [encodeFilename(path, True)])
+
+ try:
+ p = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+ except EnvironmentError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
+ stdout, stderr = p.communicate()
+ stderr = stderr.decode('utf-8', 'replace')
+ if p.returncode != 0:
+ raise XAttrMetadataError(p.returncode, stderr)
else:
# On Unix, and can't find pyxattr, setfattr, or xattr.
@@ -121,6 +158,19 @@ class XAttrMetadataPP(PostProcessor):
return [], info
- except (subprocess.CalledProcessError, OSError):
- self._downloader.report_error("This filesystem doesn't support extended attributes. (You may have to enable them in your /etc/fstab)")
+ except XAttrMetadataError as e:
+ if e.reason == 'NO_SPACE':
+ self._downloader.report_warning(
+ 'There\'s no disk space left or disk quota exceeded. ' +
+ 'Extended attributes are not written.')
+ elif e.reason == 'VALUE_TOO_LONG':
+ self._downloader.report_warning(
+ 'Unable to write extended attributes due to too long values.')
+ else:
+ msg = 'This filesystem doesn\'t support extended attributes. '
+ if os.name == 'nt':
+ msg += 'You need to use NTFS.'
+ else:
+ msg += '(You may have to enable them in your /etc/fstab)'
+ self._downloader.report_error(msg)
return [], info
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index de3169eef..fc7ac8305 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -50,7 +50,7 @@ def rsa_verify(message, signature, key):
def update_self(to_screen, verbose):
"""Update the program file with the latest version from the repository"""
- UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
+ UPDATE_URL = "https://rg3.github.io/youtube-dl/update/"
VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
JSON_URL = UPDATE_URL + 'versions.json'
UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index a5a5c317e..e265c7574 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -62,6 +62,8 @@ std_headers = {
}
+NO_DEFAULT = object()
+
ENGLISH_MONTH_NAMES = [
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
@@ -137,21 +139,24 @@ def write_json_file(obj, fn):
if sys.version_info >= (2, 7):
- def find_xpath_attr(node, xpath, key, val):
+ def find_xpath_attr(node, xpath, key, val=None):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z-]+$', key)
- assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
- expr = xpath + "[@%s='%s']" % (key, val)
+ if val:
+ assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+ expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
return node.find(expr)
else:
- def find_xpath_attr(node, xpath, key, val):
+ def find_xpath_attr(node, xpath, key, val=None):
# Here comes the crazy part: In 2.6, if the xpath is a unicode,
# .//node does not match if a node is a direct child of . !
if isinstance(xpath, compat_str):
xpath = xpath.encode('ascii')
for f in node.findall(xpath):
- if f.attrib.get(key) == val:
+ if key not in f.attrib:
+ continue
+ if val is None or f.attrib.get(key) == val:
return f
return None
@@ -171,13 +176,15 @@ def xpath_with_ns(path, ns_map):
return '/'.join(replaced)
-def xpath_text(node, xpath, name=None, fatal=False):
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
if sys.version_info < (2, 7): # Crazy 2.6
xpath = xpath.encode('ascii')
n = node.find(xpath)
if n is None or n.text is None:
- if fatal:
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
name = xpath if name is None else name
raise ExtractorError('Could not find XML element %s' % name)
else:
@@ -327,13 +334,6 @@ def sanitize_path(s):
return os.path.join(*sanitized_path)
-def sanitize_url_path_consecutive_slashes(url):
- """Collapses consecutive slashes in URLs' path"""
- parsed_url = list(compat_urlparse.urlparse(url))
- parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
- return compat_urlparse.urlunparse(parsed_url)
-
-
def orderedSet(iterable):
""" Remove all duplicates from the input iterable """
res = []
@@ -579,11 +579,9 @@ class ContentTooShortError(Exception):
download is too small for what the server announced first, indicating
the connection was probably interrupted.
"""
- # Both in bytes
- downloaded = None
- expected = None
def __init__(self, downloaded, expected):
+ # Both in bytes
self.downloaded = downloaded
self.expected = expected
@@ -653,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
return ret
def http_request(self, req):
+ # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
+ # always respected by websites, some tend to give out URLs with non percent-encoded
+ # non-ASCII characters (see telemb.py, ard.py [#3412])
+ # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+ # To work around aforementioned issue we will replace request's original URL with
+ # percent-encoded one
+ # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+ # the code of this workaround has been moved here from YoutubeDL.urlopen()
+ url = req.get_full_url()
+ url_escaped = escape_url(url)
+
+ # Substitute URL if any change after escaping
+ if url != url_escaped:
+ req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+ new_req = req_type(
+ url_escaped, data=req.data, headers=req.headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+ new_req.timeout = req.timeout
+ req = new_req
+
for h, v in std_headers.items():
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
# The dict keys are capitalized because of this bug by urllib
@@ -697,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
gz = io.BytesIO(self.deflate(resp.read()))
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
+ # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
+ if 300 <= resp.code < 400:
+ location = resp.headers.get('Location')
+ if location:
+ # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
+ if sys.version_info >= (3, 0):
+ location = location.encode('iso-8859-1').decode('utf-8')
+ location_escaped = escape_url(location)
+ if location != location_escaped:
+ del resp.headers['Location']
+ resp.headers['Location'] = location_escaped
return resp
https_request = http_request
@@ -1312,10 +1341,10 @@ def parse_duration(s):
m = re.match(
r'''(?ix)(?:P?T)?
(?:
- (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
+ (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
(?P<only_hours>[0-9.]+)\s*(?:hours?)|
- \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
+ \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
(?:
(?:
(?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
@@ -1380,7 +1409,7 @@ def get_exe_version(exe, args=['--version'],
or False if the executable is not present """
try:
out, _ = subprocess.Popen(
- [exe] + args,
+ [encodeArgument(exe)] + args,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
except OSError:
return False
@@ -1486,6 +1515,14 @@ def uppercase_escape(s):
s)
+def lowercase_escape(s):
+    """Expand every backslash-u escape (four hex digits) found in s.
+
+    Each match is decoded with the 'unicode_escape' codec and replaced by
+    the resulting character; everything else in s is left as is.
+    """
+    decode = codecs.getdecoder('unicode_escape')
+
+    def _expand(match):
+        # The decoder returns (text, consumed_length); only the text is needed
+        decoded, _consumed = decode(match.group(0))
+        return decoded
+
+    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
+
+
def escape_rfc3986(s):
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, compat_str):
@@ -1664,6 +1701,7 @@ def mimetype2ext(mt):
return {
'x-ms-wmv': 'wmv',
'x-mp4-fragmented': 'mp4',
+ 'ttml+xml': 'ttml',
}.get(res, res)
@@ -1834,16 +1872,15 @@ def parse_dfxp_time_expr(time_expr):
return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
-def format_srt_time(seconds):
- (mins, secs) = divmod(seconds, 60)
- (hours, mins) = divmod(mins, 60)
- millisecs = (secs - int(secs)) * 1000
- secs = int(secs)
- return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+def srt_subtitles_timecode(seconds):
+    """Format a non-negative duration in seconds as an SRT timecode HH:MM:SS,mmm.
+
+    The value is rounded to whole milliseconds first. Splitting the float
+    directly truncates, which loses a millisecond for inputs such as 1.001
+    (whose fractional part is stored as 0.000999...).
+    """
+    total_ms = int(round(seconds * 1000))
+    secs, millis = divmod(total_ms, 1000)
+    mins, secs = divmod(secs, 60)
+    hours, mins = divmod(mins, 60)
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
def dfxp2srt(dfxp_data):
- _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+ _x = functools.partial(xpath_with_ns, ns_map={
+ 'ttml': 'http://www.w3.org/ns/ttml',
+ 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+ })
def parse_node(node):
str_or_empty = functools.partial(str_or_none, default='')
@@ -1851,9 +1888,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text)
for child in node:
- if child.tag == _x('ttml:br'):
+ if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
- elif child.tag == _x('ttml:span'):
+ elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1862,18 +1899,487 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p'))
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+
+ if not paras:
+ raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)):
+ begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+ end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+ if not end_time:
+ end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
out.append('%d\n%s --> %s\n%s\n\n' % (
index,
- format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
- format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+ srt_subtitles_timecode(begin_time),
+ srt_subtitles_timecode(end_time),
parse_node(para)))
return ''.join(out)
+class ISO639Utils(object):
+ # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+ _lang_map = {
+ 'aa': 'aar',
+ 'ab': 'abk',
+ 'ae': 'ave',
+ 'af': 'afr',
+ 'ak': 'aka',
+ 'am': 'amh',
+ 'an': 'arg',
+ 'ar': 'ara',
+ 'as': 'asm',
+ 'av': 'ava',
+ 'ay': 'aym',
+ 'az': 'aze',
+ 'ba': 'bak',
+ 'be': 'bel',
+ 'bg': 'bul',
+ 'bh': 'bih',
+ 'bi': 'bis',
+ 'bm': 'bam',
+ 'bn': 'ben',
+ 'bo': 'bod',
+ 'br': 'bre',
+ 'bs': 'bos',
+ 'ca': 'cat',
+ 'ce': 'che',
+ 'ch': 'cha',
+ 'co': 'cos',
+ 'cr': 'cre',
+ 'cs': 'ces',
+ 'cu': 'chu',
+ 'cv': 'chv',
+ 'cy': 'cym',
+ 'da': 'dan',
+ 'de': 'deu',
+ 'dv': 'div',
+ 'dz': 'dzo',
+ 'ee': 'ewe',
+ 'el': 'ell',
+ 'en': 'eng',
+ 'eo': 'epo',
+ 'es': 'spa',
+ 'et': 'est',
+ 'eu': 'eus',
+ 'fa': 'fas',
+ 'ff': 'ful',
+ 'fi': 'fin',
+ 'fj': 'fij',
+ 'fo': 'fao',
+ 'fr': 'fra',
+ 'fy': 'fry',
+ 'ga': 'gle',
+ 'gd': 'gla',
+ 'gl': 'glg',
+ 'gn': 'grn',
+ 'gu': 'guj',
+ 'gv': 'glv',
+ 'ha': 'hau',
+ 'he': 'heb',
+ 'hi': 'hin',
+ 'ho': 'hmo',
+ 'hr': 'hrv',
+ 'ht': 'hat',
+ 'hu': 'hun',
+ 'hy': 'hye',
+ 'hz': 'her',
+ 'ia': 'ina',
+ 'id': 'ind',
+ 'ie': 'ile',
+ 'ig': 'ibo',
+ 'ii': 'iii',
+ 'ik': 'ipk',
+ 'io': 'ido',
+ 'is': 'isl',
+ 'it': 'ita',
+ 'iu': 'iku',
+ 'ja': 'jpn',
+ 'jv': 'jav',
+ 'ka': 'kat',
+ 'kg': 'kon',
+ 'ki': 'kik',
+ 'kj': 'kua',
+ 'kk': 'kaz',
+ 'kl': 'kal',
+ 'km': 'khm',
+ 'kn': 'kan',
+ 'ko': 'kor',
+ 'kr': 'kau',
+ 'ks': 'kas',
+ 'ku': 'kur',
+ 'kv': 'kom',
+ 'kw': 'cor',
+ 'ky': 'kir',
+ 'la': 'lat',
+ 'lb': 'ltz',
+ 'lg': 'lug',
+ 'li': 'lim',
+ 'ln': 'lin',
+ 'lo': 'lao',
+ 'lt': 'lit',
+ 'lu': 'lub',
+ 'lv': 'lav',
+ 'mg': 'mlg',
+ 'mh': 'mah',
+ 'mi': 'mri',
+ 'mk': 'mkd',
+ 'ml': 'mal',
+ 'mn': 'mon',
+ 'mr': 'mar',
+ 'ms': 'msa',
+ 'mt': 'mlt',
+ 'my': 'mya',
+ 'na': 'nau',
+ 'nb': 'nob',
+ 'nd': 'nde',
+ 'ne': 'nep',
+ 'ng': 'ndo',
+ 'nl': 'nld',
+ 'nn': 'nno',
+ 'no': 'nor',
+ 'nr': 'nbl',
+ 'nv': 'nav',
+ 'ny': 'nya',
+ 'oc': 'oci',
+ 'oj': 'oji',
+ 'om': 'orm',
+ 'or': 'ori',
+ 'os': 'oss',
+ 'pa': 'pan',
+ 'pi': 'pli',
+ 'pl': 'pol',
+ 'ps': 'pus',
+ 'pt': 'por',
+ 'qu': 'que',
+ 'rm': 'roh',
+ 'rn': 'run',
+ 'ro': 'ron',
+ 'ru': 'rus',
+ 'rw': 'kin',
+ 'sa': 'san',
+ 'sc': 'srd',
+ 'sd': 'snd',
+ 'se': 'sme',
+ 'sg': 'sag',
+ 'si': 'sin',
+ 'sk': 'slk',
+ 'sl': 'slv',
+ 'sm': 'smo',
+ 'sn': 'sna',
+ 'so': 'som',
+ 'sq': 'sqi',
+ 'sr': 'srp',
+ 'ss': 'ssw',
+ 'st': 'sot',
+ 'su': 'sun',
+ 'sv': 'swe',
+ 'sw': 'swa',
+ 'ta': 'tam',
+ 'te': 'tel',
+ 'tg': 'tgk',
+ 'th': 'tha',
+ 'ti': 'tir',
+ 'tk': 'tuk',
+ 'tl': 'tgl',
+ 'tn': 'tsn',
+ 'to': 'ton',
+ 'tr': 'tur',
+ 'ts': 'tso',
+ 'tt': 'tat',
+ 'tw': 'twi',
+ 'ty': 'tah',
+ 'ug': 'uig',
+ 'uk': 'ukr',
+ 'ur': 'urd',
+ 'uz': 'uzb',
+ 've': 'ven',
+ 'vi': 'vie',
+ 'vo': 'vol',
+ 'wa': 'wln',
+ 'wo': 'wol',
+ 'xh': 'xho',
+ 'yi': 'yid',
+ 'yo': 'yor',
+ 'za': 'zha',
+ 'zh': 'zho',
+ 'zu': 'zul',
+ }
+
+ @classmethod
+ def short2long(cls, code):
+ """Convert language code from ISO 639-1 to ISO 639-2/T"""
+ return cls._lang_map.get(code[:2])
+
+ @classmethod
+ def long2short(cls, code):
+ """Convert language code from ISO 639-2/T to ISO 639-1"""
+ for short_name, long_name in cls._lang_map.items():
+ if long_name == code:
+ return short_name
+
+
+class ISO3166Utils(object):
+ # From http://data.okfn.org/data/core/country-list
+ _country_map = {
+ 'AF': 'Afghanistan',
+ 'AX': 'Åland Islands',
+ 'AL': 'Albania',
+ 'DZ': 'Algeria',
+ 'AS': 'American Samoa',
+ 'AD': 'Andorra',
+ 'AO': 'Angola',
+ 'AI': 'Anguilla',
+ 'AQ': 'Antarctica',
+ 'AG': 'Antigua and Barbuda',
+ 'AR': 'Argentina',
+ 'AM': 'Armenia',
+ 'AW': 'Aruba',
+ 'AU': 'Australia',
+ 'AT': 'Austria',
+ 'AZ': 'Azerbaijan',
+ 'BS': 'Bahamas',
+ 'BH': 'Bahrain',
+ 'BD': 'Bangladesh',
+ 'BB': 'Barbados',
+ 'BY': 'Belarus',
+ 'BE': 'Belgium',
+ 'BZ': 'Belize',
+ 'BJ': 'Benin',
+ 'BM': 'Bermuda',
+ 'BT': 'Bhutan',
+ 'BO': 'Bolivia, Plurinational State of',
+ 'BQ': 'Bonaire, Sint Eustatius and Saba',
+ 'BA': 'Bosnia and Herzegovina',
+ 'BW': 'Botswana',
+ 'BV': 'Bouvet Island',
+ 'BR': 'Brazil',
+ 'IO': 'British Indian Ocean Territory',
+ 'BN': 'Brunei Darussalam',
+ 'BG': 'Bulgaria',
+ 'BF': 'Burkina Faso',
+ 'BI': 'Burundi',
+ 'KH': 'Cambodia',
+ 'CM': 'Cameroon',
+ 'CA': 'Canada',
+ 'CV': 'Cape Verde',
+ 'KY': 'Cayman Islands',
+ 'CF': 'Central African Republic',
+ 'TD': 'Chad',
+ 'CL': 'Chile',
+ 'CN': 'China',
+ 'CX': 'Christmas Island',
+ 'CC': 'Cocos (Keeling) Islands',
+ 'CO': 'Colombia',
+ 'KM': 'Comoros',
+ 'CG': 'Congo',
+ 'CD': 'Congo, the Democratic Republic of the',
+ 'CK': 'Cook Islands',
+ 'CR': 'Costa Rica',
+ 'CI': 'Côte d\'Ivoire',
+ 'HR': 'Croatia',
+ 'CU': 'Cuba',
+ 'CW': 'Curaçao',
+ 'CY': 'Cyprus',
+ 'CZ': 'Czech Republic',
+ 'DK': 'Denmark',
+ 'DJ': 'Djibouti',
+ 'DM': 'Dominica',
+ 'DO': 'Dominican Republic',
+ 'EC': 'Ecuador',
+ 'EG': 'Egypt',
+ 'SV': 'El Salvador',
+ 'GQ': 'Equatorial Guinea',
+ 'ER': 'Eritrea',
+ 'EE': 'Estonia',
+ 'ET': 'Ethiopia',
+ 'FK': 'Falkland Islands (Malvinas)',
+ 'FO': 'Faroe Islands',
+ 'FJ': 'Fiji',
+ 'FI': 'Finland',
+ 'FR': 'France',
+ 'GF': 'French Guiana',
+ 'PF': 'French Polynesia',
+ 'TF': 'French Southern Territories',
+ 'GA': 'Gabon',
+ 'GM': 'Gambia',
+ 'GE': 'Georgia',
+ 'DE': 'Germany',
+ 'GH': 'Ghana',
+ 'GI': 'Gibraltar',
+ 'GR': 'Greece',
+ 'GL': 'Greenland',
+ 'GD': 'Grenada',
+ 'GP': 'Guadeloupe',
+ 'GU': 'Guam',
+ 'GT': 'Guatemala',
+ 'GG': 'Guernsey',
+ 'GN': 'Guinea',
+ 'GW': 'Guinea-Bissau',
+ 'GY': 'Guyana',
+ 'HT': 'Haiti',
+ 'HM': 'Heard Island and McDonald Islands',
+ 'VA': 'Holy See (Vatican City State)',
+ 'HN': 'Honduras',
+ 'HK': 'Hong Kong',
+ 'HU': 'Hungary',
+ 'IS': 'Iceland',
+ 'IN': 'India',
+ 'ID': 'Indonesia',
+ 'IR': 'Iran, Islamic Republic of',
+ 'IQ': 'Iraq',
+ 'IE': 'Ireland',
+ 'IM': 'Isle of Man',
+ 'IL': 'Israel',
+ 'IT': 'Italy',
+ 'JM': 'Jamaica',
+ 'JP': 'Japan',
+ 'JE': 'Jersey',
+ 'JO': 'Jordan',
+ 'KZ': 'Kazakhstan',
+ 'KE': 'Kenya',
+ 'KI': 'Kiribati',
+ 'KP': 'Korea, Democratic People\'s Republic of',
+ 'KR': 'Korea, Republic of',
+ 'KW': 'Kuwait',
+ 'KG': 'Kyrgyzstan',
+ 'LA': 'Lao People\'s Democratic Republic',
+ 'LV': 'Latvia',
+ 'LB': 'Lebanon',
+ 'LS': 'Lesotho',
+ 'LR': 'Liberia',
+ 'LY': 'Libya',
+ 'LI': 'Liechtenstein',
+ 'LT': 'Lithuania',
+ 'LU': 'Luxembourg',
+ 'MO': 'Macao',
+ 'MK': 'Macedonia, the Former Yugoslav Republic of',
+ 'MG': 'Madagascar',
+ 'MW': 'Malawi',
+ 'MY': 'Malaysia',
+ 'MV': 'Maldives',
+ 'ML': 'Mali',
+ 'MT': 'Malta',
+ 'MH': 'Marshall Islands',
+ 'MQ': 'Martinique',
+ 'MR': 'Mauritania',
+ 'MU': 'Mauritius',
+ 'YT': 'Mayotte',
+ 'MX': 'Mexico',
+ 'FM': 'Micronesia, Federated States of',
+ 'MD': 'Moldova, Republic of',
+ 'MC': 'Monaco',
+ 'MN': 'Mongolia',
+ 'ME': 'Montenegro',
+ 'MS': 'Montserrat',
+ 'MA': 'Morocco',
+ 'MZ': 'Mozambique',
+ 'MM': 'Myanmar',
+ 'NA': 'Namibia',
+ 'NR': 'Nauru',
+ 'NP': 'Nepal',
+ 'NL': 'Netherlands',
+ 'NC': 'New Caledonia',
+ 'NZ': 'New Zealand',
+ 'NI': 'Nicaragua',
+ 'NE': 'Niger',
+ 'NG': 'Nigeria',
+ 'NU': 'Niue',
+ 'NF': 'Norfolk Island',
+ 'MP': 'Northern Mariana Islands',
+ 'NO': 'Norway',
+ 'OM': 'Oman',
+ 'PK': 'Pakistan',
+ 'PW': 'Palau',
+ 'PS': 'Palestine, State of',
+ 'PA': 'Panama',
+ 'PG': 'Papua New Guinea',
+ 'PY': 'Paraguay',
+ 'PE': 'Peru',
+ 'PH': 'Philippines',
+ 'PN': 'Pitcairn',
+ 'PL': 'Poland',
+ 'PT': 'Portugal',
+ 'PR': 'Puerto Rico',
+ 'QA': 'Qatar',
+ 'RE': 'Réunion',
+ 'RO': 'Romania',
+ 'RU': 'Russian Federation',
+ 'RW': 'Rwanda',
+ 'BL': 'Saint Barthélemy',
+ 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
+ 'KN': 'Saint Kitts and Nevis',
+ 'LC': 'Saint Lucia',
+ 'MF': 'Saint Martin (French part)',
+ 'PM': 'Saint Pierre and Miquelon',
+ 'VC': 'Saint Vincent and the Grenadines',
+ 'WS': 'Samoa',
+ 'SM': 'San Marino',
+ 'ST': 'Sao Tome and Principe',
+ 'SA': 'Saudi Arabia',
+ 'SN': 'Senegal',
+ 'RS': 'Serbia',
+ 'SC': 'Seychelles',
+ 'SL': 'Sierra Leone',
+ 'SG': 'Singapore',
+ 'SX': 'Sint Maarten (Dutch part)',
+ 'SK': 'Slovakia',
+ 'SI': 'Slovenia',
+ 'SB': 'Solomon Islands',
+ 'SO': 'Somalia',
+ 'ZA': 'South Africa',
+ 'GS': 'South Georgia and the South Sandwich Islands',
+ 'SS': 'South Sudan',
+ 'ES': 'Spain',
+ 'LK': 'Sri Lanka',
+ 'SD': 'Sudan',
+ 'SR': 'Suriname',
+ 'SJ': 'Svalbard and Jan Mayen',
+ 'SZ': 'Swaziland',
+ 'SE': 'Sweden',
+ 'CH': 'Switzerland',
+ 'SY': 'Syrian Arab Republic',
+ 'TW': 'Taiwan, Province of China',
+ 'TJ': 'Tajikistan',
+ 'TZ': 'Tanzania, United Republic of',
+ 'TH': 'Thailand',
+ 'TL': 'Timor-Leste',
+ 'TG': 'Togo',
+ 'TK': 'Tokelau',
+ 'TO': 'Tonga',
+ 'TT': 'Trinidad and Tobago',
+ 'TN': 'Tunisia',
+ 'TR': 'Turkey',
+ 'TM': 'Turkmenistan',
+ 'TC': 'Turks and Caicos Islands',
+ 'TV': 'Tuvalu',
+ 'UG': 'Uganda',
+ 'UA': 'Ukraine',
+ 'AE': 'United Arab Emirates',
+ 'GB': 'United Kingdom',
+ 'US': 'United States',
+ 'UM': 'United States Minor Outlying Islands',
+ 'UY': 'Uruguay',
+ 'UZ': 'Uzbekistan',
+ 'VU': 'Vanuatu',
+ 'VE': 'Venezuela, Bolivarian Republic of',
+ 'VN': 'Viet Nam',
+ 'VG': 'Virgin Islands, British',
+ 'VI': 'Virgin Islands, U.S.',
+ 'WF': 'Wallis and Futuna',
+ 'EH': 'Western Sahara',
+ 'YE': 'Yemen',
+ 'ZM': 'Zambia',
+ 'ZW': 'Zimbabwe',
+ }
+
+ @classmethod
+ def short2full(cls, code):
+ """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
+ return cls._country_map.get(code.upper())
+
+
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
# Set default handlers
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b88ea85e8..a07bc9233 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.04.28'
+__version__ = '2015.08.28'