aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py19
-rwxr-xr-xyoutube_dl/__main__.py2
-rw-r--r--youtube_dl/compat.py121
-rw-r--r--youtube_dl/downloader/common.py2
-rw-r--r--youtube_dl/downloader/f4m.py22
-rw-r--r--youtube_dl/downloader/hls.py17
-rw-r--r--youtube_dl/downloader/rtmp.py2
-rw-r--r--youtube_dl/extractor/__init__.py41
-rw-r--r--youtube_dl/extractor/abc.py18
-rw-r--r--youtube_dl/extractor/adultswim.py46
-rw-r--r--youtube_dl/extractor/anitube.py4
-rw-r--r--youtube_dl/extractor/appletrailers.py68
-rw-r--r--youtube_dl/extractor/ard.py4
-rw-r--r--youtube_dl/extractor/arte.py12
-rw-r--r--youtube_dl/extractor/bandcamp.py12
-rw-r--r--youtube_dl/extractor/bbc.py271
-rw-r--r--youtube_dl/extractor/beeg.py70
-rw-r--r--youtube_dl/extractor/bild.py22
-rw-r--r--youtube_dl/extractor/bilibili.py6
-rw-r--r--youtube_dl/extractor/brightcove.py4
-rw-r--r--youtube_dl/extractor/canalc2.py43
-rw-r--r--youtube_dl/extractor/canalplus.py3
-rw-r--r--youtube_dl/extractor/cbs.py12
-rw-r--r--youtube_dl/extractor/cbsnews.py5
-rw-r--r--youtube_dl/extractor/channel9.py66
-rw-r--r--youtube_dl/extractor/chaturbate.py50
-rw-r--r--youtube_dl/extractor/clubic.py9
-rw-r--r--youtube_dl/extractor/clyp.py57
-rw-r--r--youtube_dl/extractor/cmt.py5
-rw-r--r--youtube_dl/extractor/comedycentral.py7
-rw-r--r--youtube_dl/extractor/common.py79
-rw-r--r--youtube_dl/extractor/condenast.py50
-rw-r--r--youtube_dl/extractor/criterion.py4
-rw-r--r--youtube_dl/extractor/crunchyroll.py97
-rw-r--r--youtube_dl/extractor/dailymotion.py33
-rw-r--r--youtube_dl/extractor/democracynow.py88
-rw-r--r--youtube_dl/extractor/divxstage.py27
-rw-r--r--youtube_dl/extractor/eagleplatform.py29
-rw-r--r--youtube_dl/extractor/eitb.py95
-rw-r--r--youtube_dl/extractor/engadget.py2
-rw-r--r--youtube_dl/extractor/europa.py93
-rw-r--r--youtube_dl/extractor/expotv.py31
-rw-r--r--youtube_dl/extractor/extremetube.py59
-rw-r--r--youtube_dl/extractor/facebook.py23
-rw-r--r--youtube_dl/extractor/fczenit.py41
-rw-r--r--youtube_dl/extractor/fivemin.py84
-rw-r--r--youtube_dl/extractor/fktv.py89
-rw-r--r--youtube_dl/extractor/fourtube.py29
-rw-r--r--youtube_dl/extractor/francetv.py28
-rw-r--r--youtube_dl/extractor/funnyordie.py15
-rw-r--r--youtube_dl/extractor/generic.py41
-rw-r--r--youtube_dl/extractor/globo.py161
-rw-r--r--youtube_dl/extractor/googleplus.py2
-rw-r--r--youtube_dl/extractor/hostingbulk.py80
-rw-r--r--youtube_dl/extractor/iconosquare.py24
-rw-r--r--youtube_dl/extractor/imdb.py29
-rw-r--r--youtube_dl/extractor/iqiyi.py20
-rw-r--r--youtube_dl/extractor/jeuxvideo.py2
-rw-r--r--youtube_dl/extractor/kaltura.py2
-rw-r--r--youtube_dl/extractor/keek.py39
-rw-r--r--youtube_dl/extractor/kuwo.py5
-rw-r--r--youtube_dl/extractor/letv.py70
-rw-r--r--youtube_dl/extractor/limelight.py229
-rw-r--r--youtube_dl/extractor/lynda.py74
-rw-r--r--youtube_dl/extractor/mdr.py189
-rw-r--r--youtube_dl/extractor/megavideoz.py56
-rw-r--r--youtube_dl/extractor/miomio.py13
-rw-r--r--youtube_dl/extractor/mit.py2
-rw-r--r--youtube_dl/extractor/mitele.py103
-rw-r--r--youtube_dl/extractor/moniker.py47
-rw-r--r--youtube_dl/extractor/movieclips.py78
-rw-r--r--youtube_dl/extractor/mtv.py15
-rw-r--r--youtube_dl/extractor/naver.py11
-rw-r--r--youtube_dl/extractor/ndr.py445
-rw-r--r--youtube_dl/extractor/nextmedia.py16
-rw-r--r--youtube_dl/extractor/nfl.py164
-rw-r--r--youtube_dl/extractor/nhl.py26
-rw-r--r--youtube_dl/extractor/ninegag.py95
-rw-r--r--youtube_dl/extractor/novamov.py43
-rw-r--r--youtube_dl/extractor/nowness.py166
-rw-r--r--youtube_dl/extractor/nowtv.py4
-rw-r--r--youtube_dl/extractor/nowvideo.py4
-rw-r--r--youtube_dl/extractor/nrk.py30
-rw-r--r--youtube_dl/extractor/odnoklassniki.py13
-rw-r--r--youtube_dl/extractor/openfilm.py70
-rw-r--r--youtube_dl/extractor/pbs.py87
-rw-r--r--youtube_dl/extractor/periscope.py13
-rw-r--r--youtube_dl/extractor/playwire.py2
-rw-r--r--youtube_dl/extractor/pornhub.py5
-rw-r--r--youtube_dl/extractor/prosiebensat1.py2
-rw-r--r--youtube_dl/extractor/qqmusic.py37
-rw-r--r--youtube_dl/extractor/rai.py22
-rw-r--r--youtube_dl/extractor/rtbf.py15
-rw-r--r--youtube_dl/extractor/rte.py12
-rw-r--r--youtube_dl/extractor/ruutu.py17
-rw-r--r--youtube_dl/extractor/senateisvp.py4
-rw-r--r--youtube_dl/extractor/shahid.py4
-rw-r--r--youtube_dl/extractor/soundcloud.py2
-rw-r--r--youtube_dl/extractor/spiegeltv.py18
-rw-r--r--youtube_dl/extractor/stitcher.py81
-rw-r--r--youtube_dl/extractor/tapely.py6
-rw-r--r--youtube_dl/extractor/telecinco.py79
-rw-r--r--youtube_dl/extractor/tudou.py38
-rw-r--r--youtube_dl/extractor/tumblr.py57
-rw-r--r--youtube_dl/extractor/tutv.py4
-rw-r--r--youtube_dl/extractor/twitch.py34
-rw-r--r--youtube_dl/extractor/twitter.py170
-rw-r--r--youtube_dl/extractor/ustream.py92
-rw-r--r--youtube_dl/extractor/vevo.py6
-rw-r--r--youtube_dl/extractor/videofyme.py40
-rw-r--r--youtube_dl/extractor/videolecturesnet.py86
-rw-r--r--youtube_dl/extractor/vidme.py57
-rw-r--r--youtube_dl/extractor/vidzi.py10
-rw-r--r--youtube_dl/extractor/viewster.py64
-rw-r--r--youtube_dl/extractor/viidea.py188
-rw-r--r--youtube_dl/extractor/vimeo.py130
-rw-r--r--youtube_dl/extractor/vine.py70
-rw-r--r--youtube_dl/extractor/vk.py15
-rw-r--r--youtube_dl/extractor/wsj.py1
-rw-r--r--youtube_dl/extractor/xfileshare.py (renamed from youtube_dl/extractor/gorillavid.py)30
-rw-r--r--youtube_dl/extractor/xhamster.py4
-rw-r--r--youtube_dl/extractor/yandexmusic.py7
-rw-r--r--youtube_dl/extractor/youporn.py218
-rw-r--r--youtube_dl/extractor/youtube.py167
-rw-r--r--youtube_dl/extractor/zdf.py50
-rw-r--r--youtube_dl/extractor/zingmp3.py13
-rw-r--r--youtube_dl/jsinterp.py4
-rw-r--r--youtube_dl/options.py4
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py15
-rw-r--r--youtube_dl/utils.py84
-rw-r--r--youtube_dl/version.py2
131 files changed, 4478 insertions, 1972 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index d65253882..1783ce01b 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -37,6 +37,7 @@ from .compat import (
compat_tokenize_tokenize,
compat_urllib_error,
compat_urllib_request,
+ compat_urllib_request_DataHandler,
)
from .utils import (
ContentTooShortError,
@@ -571,7 +572,7 @@ class YoutubeDL(object):
if v is not None)
template_dict = collections.defaultdict(lambda: 'NA', template_dict)
- outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
+ outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
tmpl = compat_expanduser(outtmpl)
filename = tmpl % template_dict
# Temporary fix for #4787
@@ -579,7 +580,7 @@ class YoutubeDL(object):
# to workaround encoding issues with subprocess on python2 @ Windows
if sys.version_info < (3, 0) and sys.platform == 'win32':
filename = encodeFilename(filename, True).decode(preferredencoding())
- return filename
+ return sanitize_path(filename)
except ValueError as err:
self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
return None
@@ -1232,13 +1233,20 @@ class YoutubeDL(object):
except (ValueError, OverflowError, OSError):
pass
+ subtitles = info_dict.get('subtitles')
+ if subtitles:
+ for _, subtitle in subtitles.items():
+ for subtitle_format in subtitle:
+ if 'ext' not in subtitle_format:
+ subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+
if self.params.get('listsubtitles', False):
if 'automatic_captions' in info_dict:
self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
- self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
+ self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
return
info_dict['requested_subtitles'] = self.process_subtitles(
- info_dict['id'], info_dict.get('subtitles'),
+ info_dict['id'], subtitles,
info_dict.get('automatic_captions'))
# We now pick which formats have to be downloaded
@@ -1960,8 +1968,9 @@ class YoutubeDL(object):
debuglevel = 1 if self.params.get('debug_printtraffic') else 0
https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+ data_handler = compat_urllib_request_DataHandler()
opener = compat_urllib_request.build_opener(
- proxy_handler, https_handler, cookie_processor, ydlh)
+ proxy_handler, https_handler, cookie_processor, ydlh, data_handler)
# Delete the default user-agent header, which would otherwise apply in
# cases where our custom HTTP handler doesn't come into play
diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py
index 65a0f891c..42a0f8c6f 100755
--- a/youtube_dl/__main__.py
+++ b/youtube_dl/__main__.py
@@ -11,7 +11,7 @@ if __package__ is None and not hasattr(sys, "frozen"):
# direct call of __main__.py
import os.path
path = os.path.realpath(os.path.abspath(__file__))
- sys.path.append(os.path.dirname(os.path.dirname(path)))
+ sys.path.insert(0, os.path.dirname(os.path.dirname(path)))
import youtube_dl
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index e32bef279..a3e85264a 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -1,7 +1,10 @@
from __future__ import unicode_literals
+import binascii
import collections
+import email
import getpass
+import io
import optparse
import os
import re
@@ -11,6 +14,7 @@ import socket
import subprocess
import sys
import itertools
+import xml.etree.ElementTree
try:
@@ -39,6 +43,11 @@ except ImportError: # Python 2
import urlparse as compat_urlparse
try:
+ import urllib.response as compat_urllib_response
+except ImportError: # Python 2
+ import urllib as compat_urllib_response
+
+try:
import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
import cookielib as compat_cookiejar
@@ -81,6 +90,11 @@ except ImportError:
import BaseHTTPServer as compat_http_server
try:
+ compat_str = unicode # Python 2
+except NameError:
+ compat_str = str
+
+try:
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
from urllib.parse import unquote as compat_urllib_parse_unquote
from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
@@ -100,7 +114,7 @@ except ImportError: # Python 2
# Is it a string-like object?
string.split
return b''
- if isinstance(string, unicode):
+ if isinstance(string, compat_str):
string = string.encode('utf-8')
bits = string.split(b'%')
if len(bits) == 1:
@@ -151,9 +165,38 @@ except ImportError: # Python 2
return compat_urllib_parse_unquote(string, encoding, errors)
try:
- compat_str = unicode # Python 2
-except NameError:
- compat_str = str
+ from urllib.request import DataHandler as compat_urllib_request_DataHandler
+except ImportError: # Python < 3.4
+ # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py
+ class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler):
+ def data_open(self, req):
+ # data URLs as specified in RFC 2397.
+ #
+ # ignores POSTed data
+ #
+ # syntax:
+ # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
+ # mediatype := [ type "/" subtype ] *( ";" parameter )
+ # data := *urlchar
+ # parameter := attribute "=" value
+ url = req.get_full_url()
+
+ scheme, data = url.split(":", 1)
+ mediatype, data = data.split(",", 1)
+
+ # even base64 encoded data URLs might be quoted so unquote in any case:
+ data = compat_urllib_parse_unquote_to_bytes(data)
+ if mediatype.endswith(";base64"):
+ data = binascii.a2b_base64(data)
+ mediatype = mediatype[:-7]
+
+ if not mediatype:
+ mediatype = "text/plain;charset=US-ASCII"
+
+ headers = email.message_from_string(
+ "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data)))
+
+ return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url)
try:
compat_basestring = basestring # Python 2
@@ -170,6 +213,43 @@ try:
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
+if sys.version_info[0] >= 3:
+ compat_etree_fromstring = xml.etree.ElementTree.fromstring
+else:
+ # python 2.x tries to encode unicode strings with ascii (see the
+ # XMLParser._fixtext method)
+ etree = xml.etree.ElementTree
+
+ try:
+ _etree_iter = etree.Element.iter
+ except AttributeError: # Python <=2.6
+ def _etree_iter(root):
+ for el in root.findall('*'):
+ yield el
+ for sub in _etree_iter(el):
+ yield sub
+
+ # on 2.6 XML doesn't have a parser argument, function copied from CPython
+ # 2.7 source
+ def _XML(text, parser=None):
+ if not parser:
+ parser = etree.XMLParser(target=etree.TreeBuilder())
+ parser.feed(text)
+ return parser.close()
+
+ def _element_factory(*args, **kwargs):
+ el = etree.Element(*args, **kwargs)
+ for k, v in el.items():
+ if isinstance(v, bytes):
+ el.set(k, v.decode('utf-8'))
+ return el
+
+ def compat_etree_fromstring(text):
+ doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+ for el in _etree_iter(doc):
+ if el.text is not None and isinstance(el.text, bytes):
+ el.text = el.text.decode('utf-8')
+ return doc
try:
from urllib.parse import parse_qs as compat_parse_qs
@@ -234,7 +314,7 @@ else:
# Working around shlex issue with unicode strings on some python 2
# versions (see http://bugs.python.org/issue1548891)
def compat_shlex_split(s, comments=False, posix=True):
- if isinstance(s, unicode):
+ if isinstance(s, compat_str):
s = s.encode('utf-8')
return shlex.split(s, comments, posix)
@@ -416,26 +496,32 @@ if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3
else:
_terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines'])
- def compat_get_terminal_size():
- columns = compat_getenv('COLUMNS', None)
+ def compat_get_terminal_size(fallback=(80, 24)):
+ columns = compat_getenv('COLUMNS')
if columns:
columns = int(columns)
else:
columns = None
- lines = compat_getenv('LINES', None)
+ lines = compat_getenv('LINES')
if lines:
lines = int(lines)
else:
lines = None
- try:
- sp = subprocess.Popen(
- ['stty', 'size'],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- out, err = sp.communicate()
- lines, columns = map(int, out.split())
- except Exception:
- pass
+ if columns is None or lines is None or columns <= 0 or lines <= 0:
+ try:
+ sp = subprocess.Popen(
+ ['stty', 'size'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = sp.communicate()
+ _lines, _columns = map(int, out.split())
+ except Exception:
+ _columns, _lines = _terminal_size(*fallback)
+
+ if columns is None or columns <= 0:
+ columns = _columns
+ if lines is None or lines <= 0:
+ lines = _lines
return _terminal_size(columns, lines)
try:
@@ -459,6 +545,7 @@ __all__ = [
'compat_chr',
'compat_cookiejar',
'compat_cookies',
+ 'compat_etree_fromstring',
'compat_expanduser',
'compat_get_terminal_size',
'compat_getenv',
@@ -483,6 +570,8 @@ __all__ = [
'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlparse',
'compat_urllib_request',
+ 'compat_urllib_request_DataHandler',
+ 'compat_urllib_response',
'compat_urlparse',
'compat_urlretrieve',
'compat_xml_parse_error',
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 97e755d4b..29a4500d3 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -325,7 +325,7 @@ class FileDownloader(object):
)
# Check file already present
- if filename != '-' and nooverwrites_and_exists or continuedl_and_exists:
+ if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
self.report_file_already_downloaded(filename)
self._hook_progress({
'filename': filename,
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 174180db5..6170cc155 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -5,12 +5,13 @@ import io
import itertools
import os
import time
-import xml.etree.ElementTree as etree
from .fragment import FragmentFD
from ..compat import (
+ compat_etree_fromstring,
compat_urlparse,
compat_urllib_error,
+ compat_urllib_parse_urlparse,
)
from ..utils import (
encodeFilename,
@@ -285,9 +286,11 @@ class F4mFD(FragmentFD):
man_url = info_dict['url']
requested_bitrate = info_dict.get('tbr')
self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
- manifest = self.ydl.urlopen(man_url).read()
+ urlh = self.ydl.urlopen(man_url)
+ man_url = urlh.geturl()
+ manifest = urlh.read()
- doc = etree.fromstring(manifest)
+ doc = compat_etree_fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f)
for f in self._get_unencrypted_media(doc)]
if requested_bitrate is None:
@@ -329,20 +332,25 @@ class F4mFD(FragmentFD):
if not live:
write_metadata_tag(dest_stream, metadata)
+ base_url_parsed = compat_urllib_parse_urlparse(base_url)
+
self._start_frag_download(ctx)
frags_filenames = []
while fragments_list:
seg_i, frag_i = fragments_list.pop(0)
name = 'Seg%d-Frag%d' % (seg_i, frag_i)
- url = base_url + name
+ query = []
+ if base_url_parsed.query:
+ query.append(base_url_parsed.query)
if akamai_pv:
- url += '?' + akamai_pv.strip(';')
+ query.append(akamai_pv.strip(';'))
if info_dict.get('extra_param_to_segment_url'):
- url += info_dict.get('extra_param_to_segment_url')
+ query.append(info_dict['extra_param_to_segment_url'])
+ url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
try:
- success = ctx['dl'].download(frag_filename, {'url': url})
+ success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()})
if not success:
return False
(down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index b2436e732..9a83a73dd 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -28,10 +28,19 @@ class HlsFD(FileDownloader):
return False
ffpp.check_version()
- args = [
- encodeArgument(opt)
- for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')]
- args.append(encodeFilename(tmpfilename, True))
+ args = [ffpp.executable, '-y']
+
+ if info_dict['http_headers'] and re.match(r'^https?://', url):
+ # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
+ # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
+ args += [
+ '-headers',
+ ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items())]
+
+ args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc']
+
+ args = [encodeArgument(opt) for opt in args]
+ args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
self._debug_cmd(args)
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 7d19bb808..f1d219ba9 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -105,7 +105,7 @@ class RtmpFD(FileDownloader):
protocol = info_dict.get('rtmp_protocol', None)
real_time = info_dict.get('rtmp_real_time', False)
no_resume = info_dict.get('no_resume', False)
- continue_dl = info_dict.get('continuedl', True)
+ continue_dl = self.params.get('continuedl', True)
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index fcd9edec3..08cb93d76 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -45,6 +45,7 @@ from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbc import (
BBCCoUkIE,
+ BBCCoUkArticleIE,
BBCIE,
)
from .beeg import BeegIE
@@ -79,6 +80,7 @@ from .cbssports import CBSSportsIE
from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
+from .chaturbate import ChaturbateIE
from .chilloutzone import ChilloutzoneIE
from .chirbit import (
ChirbitIE,
@@ -91,6 +93,7 @@ from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cloudy import CloudyIE
from .clubic import ClubicIE
+from .clyp import ClypIE
from .cmt import CMTIE
from .cnet import CNETIE
from .cnn import (
@@ -124,6 +127,7 @@ from .dbtv import DBTVIE
from .dcn import DCNIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
+from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
@@ -141,7 +145,6 @@ from .dump import DumpIE
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
-from .divxstage import DivxStageIE
from .dropbox import DropboxIE
from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
@@ -162,6 +165,7 @@ from .eroprofile import EroProfileIE
from .escapist import EscapistIE
from .espn import ESPNIE
from .esri import EsriVideoIE
+from .europa import EuropaIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
@@ -169,14 +173,12 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
+from .fczenit import FczenitIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
-from .fktv import (
- FKTVIE,
- FKTVPosteckeIE,
-)
+from .fktv import FKTVIE
from .flickr import FlickrIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
@@ -213,13 +215,15 @@ from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
-from .globo import GloboIE
+from .globo import (
+ GloboIE,
+ GloboArticleIE,
+)
from .godtube import GodTubeIE
from .goldenmoustache import GoldenMoustacheIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
-from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .groupon import GrouponIE
from .hark import HarkIE
@@ -232,7 +236,6 @@ from .historicfilms import HistoricFilmsIE
from .history import HistoryIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
-from .hostingbulk import HostingBulkIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
@@ -302,6 +305,11 @@ from .lifenews import (
LifeNewsIE,
LifeEmbedIE,
)
+from .limelight import (
+ LimelightMediaIE,
+ LimelightChannelIE,
+ LimelightChannelListIE,
+)
from .liveleak import LiveLeakIE
from .livestream import (
LivestreamIE,
@@ -319,7 +327,6 @@ from .macgamestore import MacGameStoreIE
from .mailru import MailRuIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
-from .megavideoz import MegaVideozIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
@@ -370,6 +377,9 @@ from .nbc import (
from .ndr import (
NDRIE,
NJoyIE,
+ NDREmbedBaseIE,
+ NDREmbedIE,
+ NJoyEmbedIE,
)
from .ndtv import NDTVIE
from .netzkino import NetzkinoIE
@@ -405,7 +415,11 @@ from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
from .nova import NovaIE
from .novamov import NovaMovIE
-from .nowness import NownessIE
+from .nowness import (
+ NownessIE,
+ NownessPlaylistIE,
+ NownessSeriesIE,
+)
from .nowtv import NowTVIE
from .nowvideo import NowVideoIE
from .npo import (
@@ -435,7 +449,6 @@ from .ooyala import (
OoyalaIE,
OoyalaExternalIE,
)
-from .openfilm import OpenFilmIE
from .orf import (
ORFTVthekIE,
ORFOE1IE,
@@ -581,6 +594,7 @@ from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
+from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import (
SportBoxIE,
@@ -685,7 +699,7 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
-from .twitter import TwitterCardIE
+from .twitter import TwitterCardIE, TwitterIE
from .ubu import UbuIE
from .udemy import (
UdemyIE,
@@ -712,7 +726,6 @@ from .vh1 import VH1IE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
-from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videomega import VideoMegaIE
from .videopremium import VideoPremiumIE
@@ -722,6 +735,7 @@ from .vidme import VidmeIE
from .vidzi import VidziIE
from .vier import VierIE, VierVideosIE
from .viewster import ViewsterIE
+from .viidea import ViideaIE
from .vimeo import (
VimeoIE,
VimeoAlbumIE,
@@ -774,6 +788,7 @@ from .wrzuta import WrzutaIE
from .wsj import WSJIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
+from .xfileshare import XFileShareIE
from .xhamster import (
XHamsterIE,
XHamsterEmbedIE,
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index f9a389f67..c0e5d1abf 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -12,7 +12,7 @@ from ..utils import (
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
- _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
@@ -36,6 +36,18 @@ class ABCIE(InfoExtractor):
'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
},
'add_ie': ['Youtube'],
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',
+ 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f',
+ 'info_dict': {
+ 'id': '6880080',
+ 'ext': 'mp3',
+ 'title': 'NAB lifts interest rates, following Westpac and CBA',
+ 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728',
+ },
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -43,7 +55,7 @@ class ABCIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
mobj = re.search(
- r'inline(?P<type>Video|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
+ r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
webpage)
if mobj is None:
raise ExtractorError('Unable to extract video urls')
@@ -60,11 +72,13 @@ class ABCIE(InfoExtractor):
formats = [{
'url': url_info['url'],
+ 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none',
'width': int_or_none(url_info.get('width')),
'height': int_or_none(url_info.get('height')),
'tbr': int_or_none(url_info.get('bitrate')),
'filesize': int_or_none(url_info.get('filesize')),
} for url_info in urls_info]
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 4327c2f61..3ae618e71 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
ExtractorError,
float_or_none,
xpath_text,
@@ -40,7 +41,8 @@ class AdultSwimIE(InfoExtractor):
'id': 'rQxZvXQ4ROaSOqq-or2Mow',
'title': 'Rick and Morty - Pilot',
'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- }
+ },
+ 'skip': 'This video is only available for registered users',
}, {
'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
'playlist': [
@@ -123,7 +125,6 @@ class AdultSwimIE(InfoExtractor):
else:
collections = bootstrapped_data['show']['collections']
collection, video_info = self.find_collection_containing_video(collections, episode_path)
-
# Video wasn't found in the collections, let's try `slugged_video`.
if video_info is None:
if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
@@ -133,7 +134,15 @@ class AdultSwimIE(InfoExtractor):
show = bootstrapped_data['show']
show_title = show['title']
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+ stream = video_info.get('stream')
+ clips = [stream] if stream else video_info.get('clips')
+ if not clips:
+ raise ExtractorError(
+ 'This video is only available via cable service provider subscription that'
+ ' is not currently supported. You may want to use --cookies.'
+ if video_info.get('auth') is True else 'Unable to find stream or clips',
+ expected=True)
+ segment_ids = [clip['videoPlaybackID'] for clip in clips]
episode_id = video_info['id']
episode_title = video_info['title']
@@ -142,7 +151,7 @@ class AdultSwimIE(InfoExtractor):
entries = []
for part_num, segment_id in enumerate(segment_ids):
- segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id
+ segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id
segment_title = '%s - %s' % (show_title, episode_title)
if len(segment_ids) > 1:
@@ -158,17 +167,30 @@ class AdultSwimIE(InfoExtractor):
formats = []
file_els = idoc.findall('.//files/file') or idoc.findall('./files/file')
+ unique_urls = []
+ unique_file_els = []
for file_el in file_els:
+ media_url = file_el.text
+ if not media_url or determine_ext(media_url) == 'f4m':
+ continue
+ if file_el.text not in unique_urls:
+ unique_urls.append(file_el.text)
+ unique_file_els.append(file_el)
+
+ for file_el in unique_file_els:
bitrate = file_el.attrib.get('bitrate')
ftype = file_el.attrib.get('type')
-
- formats.append({
- 'format_id': '%s_%s' % (bitrate, ftype),
- 'url': file_el.text.strip(),
- # The bitrate may not be a number (for example: 'iphone')
- 'tbr': int(bitrate) if bitrate.isdigit() else None,
- 'quality': 1 if ftype == 'hd' else -1
- })
+ media_url = file_el.text
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, segment_title, 'mp4', preference=0, m3u8_id='hls'))
+ else:
+ formats.append({
+ 'format_id': '%s_%s' % (bitrate, ftype),
+ 'url': file_el.text.strip(),
+ # The bitrate may not be a number (for example: 'iphone')
+ 'tbr': int(bitrate) if bitrate.isdigit() else None,
+ })
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py
index 31f0d417c..23f942ae2 100644
--- a/youtube_dl/extractor/anitube.py
+++ b/youtube_dl/extractor/anitube.py
@@ -26,8 +26,8 @@ class AnitubeIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- key = self._html_search_regex(
- r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key')
+ key = self._search_regex(
+ r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key')
config_xml = self._download_xml(
'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key)
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 576f03b5b..f68dc3236 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -13,53 +13,53 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TESTS = [{
- "url": "http://trailers.apple.com/trailers/wb/manofsteel/",
+ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
'id': 'manofsteel',
},
- "playlist": [
+ 'playlist': [
{
- "md5": "d97a8e575432dbcb81b7c3acb741f8a8",
- "info_dict": {
- "id": "manofsteel-trailer4",
- "ext": "mov",
- "duration": 111,
- "title": "Trailer 4",
- "upload_date": "20130523",
- "uploader_id": "wb",
+ 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer4',
+ 'ext': 'mov',
+ 'duration': 111,
+ 'title': 'Trailer 4',
+ 'upload_date': '20130523',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "b8017b7131b721fb4e8d6f49e1df908c",
- "info_dict": {
- "id": "manofsteel-trailer3",
- "ext": "mov",
- "duration": 182,
- "title": "Trailer 3",
- "upload_date": "20130417",
- "uploader_id": "wb",
+ 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer3',
+ 'ext': 'mov',
+ 'duration': 182,
+ 'title': 'Trailer 3',
+ 'upload_date': '20130417',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "d0f1e1150989b9924679b441f3404d48",
- "info_dict": {
- "id": "manofsteel-trailer",
- "ext": "mov",
- "duration": 148,
- "title": "Trailer",
- "upload_date": "20121212",
- "uploader_id": "wb",
+ 'md5': 'd0f1e1150989b9924679b441f3404d48',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer',
+ 'ext': 'mov',
+ 'duration': 148,
+ 'title': 'Trailer',
+ 'upload_date': '20121212',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "5fe08795b943eb2e757fa95cb6def1cb",
- "info_dict": {
- "id": "manofsteel-teaser",
- "ext": "mov",
- "duration": 93,
- "title": "Teaser",
- "upload_date": "20120721",
- "uploader_id": "wb",
+ 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
+ 'info_dict': {
+ 'id': 'manofsteel-teaser',
+ 'ext': 'mov',
+ 'duration': 93,
+ 'title': 'Teaser',
+ 'upload_date': '20120721',
+ 'uploader_id': 'wb',
},
},
]
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 6f465789b..73be6d204 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -14,8 +14,8 @@ from ..utils import (
parse_duration,
unified_strdate,
xpath_text,
- parse_xml,
)
+from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor):
@@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):
raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
if re.search(r'[\?&]rss($|[=&])', url):
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 76de24477..2a00da3ee 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -4,6 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
find_xpath_attr,
unified_strdate,
@@ -77,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor):
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(
[r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'],
- webpage, 'json vp url')
+ webpage, 'json vp url', default=None)
+ if not json_url:
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+ webpage, 'iframe url', group='url')
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 505877b77..c1ef8051d 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -10,6 +10,8 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ float_or_none,
+ int_or_none,
)
@@ -52,11 +54,11 @@ class BandcampIE(InfoExtractor):
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
- 'url': format_url,
+ 'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
- 'abr': int(abr_str),
+ 'abr': int_or_none(abr_str),
})
self._sort_formats(formats)
@@ -65,7 +67,7 @@ class BandcampIE(InfoExtractor):
'id': compat_str(data['id']),
'title': data['title'],
'formats': formats,
- 'duration': float(data['duration']),
+ 'duration': float_or_none(data.get('duration')),
}
else:
raise ExtractorError('No free songs found')
@@ -93,8 +95,8 @@ class BandcampIE(InfoExtractor):
final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
# in the "download_url" key
- final_url = self._search_regex(
- r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL')
+ final_url = self._proto_relative_url(self._search_regex(
+ r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index abc5a44a1..a55a6dbc9 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -11,19 +10,36 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
+ remove_end,
+ unescapeHTML,
+)
+from ..compat import (
+ compat_etree_fromstring,
+ compat_HTTPError,
)
-from ..compat import compat_HTTPError
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z]{8})'
_MEDIASELECTOR_URLS = [
+ # Provides HQ HLS streams with even better quality than the pc mediaset but
+ # fails with geolocation in some cases even when the programme is not geo
+ # restricted at all (e.g. http://www.bbc.co.uk/programmes/b06bp7lf)
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
]
+ _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
+ _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
+
+ _NAMESPACES = (
+ _MEDIASELECTION_NS,
+ _EMP_PLAYLIST_NS,
+ )
+
_TESTS = [
{
'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
@@ -153,6 +169,21 @@ class BBCCoUkIE(InfoExtractor):
},
'skip': 'geolocation',
}, {
+ # iptv-all mediaset fails with geolocation however there is no geo restriction
+ # for this programme at all
+ 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf',
+ 'info_dict': {
+ 'id': 'b06bp7kf',
+ 'ext': 'flv',
+ 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie",
+ 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.',
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
}, {
@@ -174,6 +205,7 @@ class BBCCoUkIE(InfoExtractor):
def _extract_connection(self, connection, programme_id):
formats = []
+ kind = connection.get('kind')
protocol = connection.get('protocol')
supplier = connection.get('supplier')
if protocol == 'http':
@@ -189,11 +221,17 @@ class BBCCoUkIE(InfoExtractor):
# Skip DASH until supported
elif transfer_format == 'dash':
pass
+ elif transfer_format == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=supplier, fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
# Direct link
else:
formats.append({
'url': href,
- 'format_id': supplier,
+ 'format_id': supplier or kind or protocol,
})
elif protocol == 'rtmp':
application = connection.get('application', 'ondemand')
@@ -213,16 +251,24 @@ class BBCCoUkIE(InfoExtractor):
return formats
def _extract_items(self, playlist):
- return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
+ return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
+
+ def _findall_ns(self, element, xpath):
+ elements = []
+ for ns in self._NAMESPACES:
+ elements.extend(element.findall(xpath % ns))
+ return elements
def _extract_medias(self, media_selection):
- error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
+ error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
+ if error is None:
+ error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)  # keep the fallback lookup's result so EMP-namespace errors are raised too
if error is not None:
raise BBCCoUkIE.MediaSelectionError(error.get('id'))
- return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
+ return self._findall_ns(media_selection, './{%s}media')
def _extract_connections(self, media):
- return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
+ return self._findall_ns(media, './{%s}connection')
def _extract_video(self, media, programme_id):
formats = []
@@ -236,13 +282,14 @@ class BBCCoUkIE(InfoExtractor):
conn_formats = self._extract_connection(connection, programme_id)
for format in conn_formats:
format.update({
- 'format_id': '%s_%s' % (service, format['format_id']),
'width': width,
'height': height,
'vbr': vbr,
'vcodec': vcodec,
'filesize': file_size,
})
+ if service:
+ format['format_id'] = '%s_%s' % (service, format['format_id'])
formats.extend(conn_formats)
return formats
@@ -287,7 +334,7 @@ class BBCCoUkIE(InfoExtractor):
return self._download_media_selector_url(
mediaselector_url % programme_id, programme_id)
except BBCCoUkIE.MediaSelectionError as e:
- if e.id == 'notukerror':
+ if e.id in ('notukerror', 'geolocation'):
last_exception = e
continue
self._raise_extractor_error(e)
@@ -299,7 +346,7 @@ class BBCCoUkIE(InfoExtractor):
url, programme_id, 'Downloading media selection XML')
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
- media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
+ media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
else:
raise
return self._process_media_selector(media_selection, programme_id)
@@ -357,7 +404,7 @@ class BBCCoUkIE(InfoExtractor):
url, playlist_id, 'Downloading legacy playlist XML')
def _extract_from_legacy_playlist(self, playlist, playlist_id):
- no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+ no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
if no_items is not None:
reason = no_items.get('reason')
if reason == 'preAvailability':
@@ -374,8 +421,9 @@ class BBCCoUkIE(InfoExtractor):
kind = item.get('kind')
if kind != 'programme' and kind != 'radioProgramme':
continue
- title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
- description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+ title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
+ description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
+ description = description_el.text if description_el is not None else None
def get_programme_id(item):
def get_from_attributes(item):
@@ -384,16 +432,18 @@ class BBCCoUkIE(InfoExtractor):
if value and re.match(r'^[pb][\da-z]{7}$', value):
return value
get_from_attributes(item)
- mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator')
+ mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
if mediator is not None:
return get_from_attributes(mediator)
programme_id = get_programme_id(item)
duration = int_or_none(item.get('duration'))
- # TODO: programme_id can be None and media items can be incorporated right inside
- # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
- # as f4m and m3u8
- formats, subtitles = self._download_media_selector(programme_id)
+
+ if programme_id:
+ formats, subtitles = self._download_media_selector(programme_id)
+ else:
+ formats, subtitles = self._process_media_selector(item, playlist_id)
+ programme_id = playlist_id
return programme_id, title, description, duration, formats, subtitles
@@ -445,6 +495,9 @@ class BBCIE(BBCCoUkIE):
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
_MEDIASELECTOR_URLS = [
+ # Provides HQ HLS streams but fails with geolocation in some cases even when
+ # the video is not geo restricted at all
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
# Provides more formats, namely direct mp4 links, but fails on some videos with
# notukerror for non UK (?) users (e.g.
# http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -454,8 +507,7 @@ class BBCIE(BBCCoUkIE):
]
_TESTS = [{
- # article with multiple videos embedded with data-media-meta containing
- # playlist.sxml, externalId and no direct video links
+ # article with multiple videos embedded with data-playable containing vpids
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
@@ -464,7 +516,7 @@ class BBCIE(BBCCoUkIE):
},
'playlist_count': 2,
}, {
- # article with multiple videos embedded with data-media-meta (more videos)
+ # article with multiple videos embedded with data-playable (more videos)
'url': 'http://www.bbc.com/news/business-28299555',
'info_dict': {
'id': 'business-28299555',
@@ -475,6 +527,7 @@ class BBCIE(BBCCoUkIE):
'skip': 'Save time',
}, {
# article with multiple videos embedded with `new SMP()`
+ # broken
'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
@@ -482,12 +535,13 @@ class BBCIE(BBCCoUkIE):
},
'playlist_count': 18,
}, {
- # single video embedded with mediaAssetPage.init()
+ # single video embedded with data-playable containing vpid
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
'id': 'p02mprgb',
'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+ 'description': 'md5:2868290467291b37feda7863f7a83f54',
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
@@ -497,15 +551,14 @@ class BBCIE(BBCCoUkIE):
'skip_download': True,
}
}, {
- # article with single video embedded with data-media-meta containing
- # direct video links (for now these are extracted) and playlist.xml (with
- # media items as f4m and m3u8 - currently unsupported)
+ # article with single video embedded with data-playable containing XML playlist
+ # with direct video links as progressiveDownloadUrl (for now these are extracted)
+ # and playlist with f4m and m3u8 as streamingUrl
'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
'info_dict': {
'id': '150615_telabyad_kentin_cogu',
'ext': 'mp4',
'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
- 'duration': 47,
'timestamp': 1434397334,
'upload_date': '20150615',
},
@@ -513,13 +566,12 @@ class BBCIE(BBCCoUkIE):
'skip_download': True,
}
}, {
- # single video embedded with mediaAssetPage.init() (regional section)
+ # single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': {
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
- 'duration': 87,
'timestamp': 1434713142,
'upload_date': '20150619',
},
@@ -561,21 +613,21 @@ class BBCIE(BBCCoUkIE):
'ext': 'mp4',
'title': 'Hyundai Santa Fe Sport: Rock star',
'description': 'md5:b042a26142c4154a6e472933cf20793d',
- 'timestamp': 1368473503,
- 'upload_date': '20130513',
+ 'timestamp': 1415867444,
+ 'upload_date': '20141113',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
- # single video with playlist.sxml URL
+ # single video with playlist.sxml URL in playlist param
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
- 'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
+ 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
'duration': 140,
},
'params': {
@@ -583,6 +635,14 @@ class BBCIE(BBCCoUkIE):
'skip_download': True,
}
}, {
+ # article with multiple videos embedded with playlist.sxml in playlist param
+ 'url': 'http://www.bbc.com/sport/0/football/34475836',
+ 'info_dict': {
+ 'id': '34475836',
+ 'title': 'What Liverpool can expect from Klopp',
+ },
+ 'playlist_count': 3,
+ }, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'only_matching': True,
@@ -594,7 +654,7 @@ class BBCIE(BBCCoUkIE):
@classmethod
def suitable(cls, url):
- return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url)
+ return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url)
def _extract_from_media_meta(self, media_meta, video_id):
# Direct links to media in media metadata (e.g.
@@ -623,40 +683,107 @@ class BBCIE(BBCCoUkIE):
return [], []
+ def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
+ programme_id, title, description, duration, formats, subtitles = \
+ self._process_legacy_playlist_url(url, playlist_id)
+ self._sort_formats(formats)
+ return {
+ 'id': programme_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- timestamp = parse_iso8601(self._search_regex(
- [r'"datePublished":\s*"([^"]+)',
- r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
- r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],
- webpage, 'date', default=None))
-
- # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng)
- playlist = self._search_regex(
- r'<param[^>]+name="playlist"[^>]+value="([^"]+)"',
- webpage, 'playlist', default=None)
- if playlist:
- programme_id, title, description, duration, formats, subtitles = \
- self._process_legacy_playlist_url(playlist, playlist_id)
- self._sort_formats(formats)
- return {
- 'id': programme_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'timestamp': timestamp,
- 'formats': formats,
- 'subtitles': subtitles,
- }
+ timestamp = None
+ playlist_title = None
+ playlist_description = None
+
+ ld = self._parse_json(
+ self._search_regex(
+ r'(?s)<script type="application/ld\+json">(.+?)</script>',
+ webpage, 'ld json', default='{}'),
+ playlist_id, fatal=False)
+ if ld:
+ timestamp = parse_iso8601(ld.get('datePublished'))
+ playlist_title = ld.get('headline')
+ playlist_description = ld.get('articleBody')
+
+ if not timestamp:
+ timestamp = parse_iso8601(self._search_regex(
+ [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
+ r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
+ r'"datePublished":\s*"([^"]+)'],
+ webpage, 'date', default=None))
+
+ entries = []
+
+ # article with multiple videos embedded with playlist.sxml (e.g.
+ # http://www.bbc.com/sport/0/football/34475836)
+ playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
+ if playlists:
+ entries = [
+ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
+ for playlist_url in playlists]
+
+ # news article with multiple videos embedded with data-playable
+ data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
+ if data_playables:
+ for _, data_playable_json in data_playables:
+ data_playable = self._parse_json(
+ unescapeHTML(data_playable_json), playlist_id, fatal=False)
+ if not data_playable:
+ continue
+ settings = data_playable.get('settings', {})
+ if settings:
+ # data-playable with video vpid in settings.playlistObject.items (e.g.
+ # http://www.bbc.com/news/world-us-canada-34473351)
+ playlist_object = settings.get('playlistObject', {})
+ if playlist_object:
+ items = playlist_object.get('items')
+ if items and isinstance(items, list):
+ title = playlist_object['title']
+ description = playlist_object.get('summary')
+ duration = int_or_none(items[0].get('duration'))
+ programme_id = items[0].get('vpid')
+ formats, subtitles = self._download_media_selector(programme_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': programme_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ else:
+ # data-playable without vpid but with playlist.sxml URLs
+ # in otherSettings.playlist (e.g.
+ # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
+ playlist = data_playable.get('otherSettings', {}).get('playlist', {})
+ if playlist:
+ entries.append(self._extract_from_playlist_sxml(
+ playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
+
+ if entries:
+ playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
+ playlist_description = playlist_description or self._og_search_description(webpage, default=None)
+ return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-video-player-vpid="([\da-z]{8})"',
r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
webpage, 'vpid', default=None)
+
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
@@ -778,3 +905,33 @@ class BBCIE(BBCCoUkIE):
})
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+
+class BBCCoUkArticleIE(InfoExtractor):
+ _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
+ IE_NAME = 'bbc.co.uk:article'
+ IE_DESC = 'BBC articles'
+
+ _TEST = {
+ 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
+ 'info_dict': {
+ 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
+ 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
+ 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
+ },
+ 'playlist_count': 4,
+ 'add_ie': ['BBCCoUk'],
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage).strip()
+
+ entries = [self.url_result(programme_url) for programme_url in re.findall(
+ r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b38057f2f..61bc2f744 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -1,65 +1,69 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '1bff67111adb785c51d1b42959ec10e5',
+ 'md5': '46c384def73b33dbc581262e5ee67cef',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
'title': 'Sultry Striptease',
- 'description': 'md5:6db3c6177972822aaba18652ff59c773',
- 'categories': list, # NSFW
- 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
+ 'timestamp': 1391813355,
+ 'upload_date': '20140207',
+ 'duration': 383,
+ 'tags': list,
'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
-
- quality_arr = self._search_regex(
- r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats')
+ video_id = self._match_id(url)
- formats = [{
- 'url': fmt[1],
- 'format_id': fmt[0],
- 'height': int(fmt[0][:-1]),
- } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)]
+ video = self._download_json(
+ 'http://beeg.com/api/v1/video/%s' % video_id, video_id)
+ formats = []
+ for format_id, video_url in video.items():
+ if not video_url:
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
+ if not height:
+ continue
+ formats.append({
+ 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'),
+ 'format_id': format_id,
+ 'height': int(height),
+ })
self._sort_formats(formats)
- title = self._html_search_regex(
- r'<title>([^<]+)\s*-\s*beeg\.?</title>', webpage, 'title')
+ title = video['title']
+ video_id = video.get('id') or video_id
+ display_id = video.get('code')
+ description = video.get('desc')
- description = self._html_search_regex(
- r'<meta name="description" content="([^"]*)"',
- webpage, 'description', fatal=False)
- thumbnail = self._html_search_regex(
- r'\'previewer.url\'\s*:\s*"([^"]*)"',
- webpage, 'thumbnail', fatal=False)
+ timestamp = parse_iso8601(video.get('date'), ' ')
+ duration = int_or_none(video.get('duration'))
- categories_str = self._html_search_regex(
- r'<meta name="keywords" content="([^"]+)"', webpage, 'categories', fatal=False)
- categories = (
- None if categories_str is None
- else categories_str.split(','))
+ tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
- 'categories': categories,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'tags': tags,
'formats': formats,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
index 4d8cce1ef..1a0184861 100644
--- a/youtube_dl/extractor/bild.py
+++ b/youtube_dl/extractor/bild.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
- fix_xml_ampersands,
+ unescapeHTML,
)
@@ -17,26 +17,24 @@ class BildIE(InfoExtractor):
'info_dict': {
'id': '38184146',
'ext': 'mp4',
- 'title': 'BILD hat sie getestet',
+ 'title': 'Das können die neuen iPads',
+ 'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 196,
- 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
- xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
- doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands)
-
- duration = int_or_none(doc.attrib.get('duration'), scale=1000)
+ video_data = self._download_json(
+ url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
return {
'id': video_id,
- 'title': doc.attrib['ueberschrift'],
- 'description': doc.attrib.get('text'),
- 'url': doc.attrib['src'],
- 'thumbnail': doc.attrib.get('img'),
- 'duration': duration,
+ 'title': unescapeHTML(video_data['title']).strip(),
+ 'description': unescapeHTML(video_data.get('description')),
+ 'url': video_data['clipList'][0]['srces'][0]['src'],
+ 'thumbnail': video_data.get('poster'),
+ 'duration': int_or_none(video_data.get('durationSec')),
}
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index ecc17ebeb..6c66a1236 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
import itertools
import json
-import xml.etree.ElementTree as ET
from .common import InfoExtractor
+from ..compat import (
+ compat_etree_fromstring,
+)
from ..utils import (
int_or_none,
unified_strdate,
@@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor):
except ValueError:
pass
- lq_doc = ET.fromstring(lq_page)
+ lq_doc = compat_etree_fromstring(lq_page)
lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml(
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index c6ad1d065..2c7d968a8 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -3,10 +3,10 @@ from __future__ import unicode_literals
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_parse_qs,
compat_str,
compat_urllib_parse,
@@ -123,7 +123,7 @@ class BrightcoveIE(InfoExtractor):
object_str = fix_xml_ampersands(object_str)
try:
- object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+ object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
except compat_xml_parse_error:
return
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index c4fefefe4..f6a1ff381 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -4,38 +4,53 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
- _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+ 'url': 'http://www.canalc2.tv/video/12163',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'mp4',
- 'title': 'Terrasses du Numérique'
+ 'ext': 'flv',
+ 'title': 'Terrasses du Numérique',
+ 'duration': 122,
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
}
}
def _real_extract(self, url):
- video_id = re.match(self._VALID_URL, url).group('id')
- # We need to set the voir field for getting the file name
- url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- file_name = self._search_regex(
- r"so\.addVariable\('file','(.*?)'\);",
- webpage, 'file name')
- video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+ video_url = self._search_regex(
+ r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2',
+ webpage, 'video_url', group='file')
+ formats = [{'url': video_url}]
+ if video_url.startswith('rtmp://'):
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+ formats[0].update({
+ 'url': rtmp.group('url'),
+ 'ext': 'flv',
+ 'app': rtmp.group('app'),
+ 'play_path': rtmp.group('play_path'),
+ 'page_url': url,
+ })
title = self._html_search_regex(
- r'class="evenement8">(.*?)</a>', webpage, 'title')
+ r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title')
+ duration = parse_duration(self._search_regex(
+ r'id=["\']video_duree["\'][^>]*>([^<]+)',
+ webpage, 'duration', fatal=False))
return {
'id': video_id,
- 'ext': 'mp4',
- 'url': video_url,
'title': title,
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 57e0cda2c..004372f8d 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -78,7 +78,8 @@ class CanalplusIE(InfoExtractor):
if video_id is None:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
- r'<canal:player[^>]+?videoId="(\d+)"', webpage, 'video id')
+ [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'],
+ webpage, 'video id', group='id')
info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
doc = self._download_xml(info_url, video_id, 'Downloading video XML')
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 75fffb156..43f05d278 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,6 +1,8 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import smuggle_url
class CBSIE(InfoExtractor):
@@ -46,13 +48,19 @@ class CBSIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
+ request = compat_urllib_request.Request(url)
+ # Android UA is served with higher quality (720p) streams (see
+ # https://github.com/rg3/youtube-dl/issues/7490)
+ request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)')
+ webpage = self._download_webpage(request, display_id)
real_id = self._search_regex(
[r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
webpage, 'real video ID')
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
- 'url': 'theplatform:%s' % real_id,
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id,
+ {'force_smil_url': True}),
'display_id': display_id,
}
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 52e61d85b..f9a64a0a2 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -67,9 +67,12 @@ class CBSNewsIE(InfoExtractor):
'format_id': format_id,
}
if uri.startswith('rtmp'):
+ play_path = re.sub(
+ r'{slistFilePath}', '',
+ uri.split('<break>')[-1].split('{break}')[-1])
fmt.update({
'app': 'ondemand?auth=cbs',
- 'play_path': 'mp4:' + uri.split('<break>')[-1],
+ 'play_path': 'mp4:' + play_path,
'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',
'page_url': 'http://www.cbsnews.com',
'ext': 'flv',
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 3dfc24f5b..c74553dcf 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -3,7 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ parse_filesize,
+ qualities,
+)
class Channel9IE(InfoExtractor):
@@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor):
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
- 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ 'thumbnail': 're:http://.*\.jpg',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
@@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor):
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,
- 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ 'thumbnail': 're:http://.*\.jpg',
'authors': ['Mike Wilmot'],
},
+ },
+ {
+ # low quality mp4 is best
+ 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'info_dict': {
+ 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'ext': 'mp4',
+ 'title': 'Ranges for the Standard Library',
+ 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+ 'duration': 5646,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}
]
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
- # Sorted by quality
- _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
-
- def _restore_bytes(self, formatted_size):
- if not formatted_size:
- return 0
- m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
- if not m:
- return 0
- units = m.group('units')
- try:
- exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
- except ValueError:
- return 0
- size = float(m.group('size'))
- return int(size * (1024 ** exponent))
-
def _formats_from_html(self, html):
FORMAT_REGEX = r'''
(?x)
@@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):
<h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
</div>)? # File size part may be missing
'''
- # Extract known formats
+ quality = qualities((
+ 'MP3', 'MP4',
+ 'Low Quality WMV', 'Low Quality MP4',
+ 'Mid Quality WMV', 'Mid Quality MP4',
+ 'High Quality WMV', 'High Quality MP4'))
formats = [{
'url': x.group('url'),
'format_id': x.group('quality'),
'format_note': x.group('note'),
'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
- 'preference': self._known_formats.index(x.group('quality')),
+ 'filesize_approx': parse_filesize(x.group('filesize')),
+ 'quality': quality(x.group('quality')),
'vcodec': 'none' if x.group('note') == 'Audio only' else None,
- } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+ } for x in list(re.finditer(FORMAT_REGEX, html))]
self._sort_formats(formats)
@@ -158,7 +164,7 @@ class Channel9IE(InfoExtractor):
def _extract_session_day(self, html):
m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
- return m.group('day') if m is not None else None
+ return m.group('day').strip() if m is not None else None
def _extract_session_room(self, html):
m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
@@ -224,12 +230,12 @@ class Channel9IE(InfoExtractor):
if contents is None:
return contents
- authors = self._extract_authors(html)
+ if len(contents) > 1:
+ raise ExtractorError('Got more than one entry')
+ result = contents[0]
+ result['authors'] = self._extract_authors(html)
- for content in contents:
- content['authors'] = authors
-
- return contents
+ return result
def _extract_session(self, html, content_path):
contents = self._extract_content(html, content_path)
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
new file mode 100644
index 000000000..0b67ba67d
--- /dev/null
+++ b/youtube_dl/extractor/chaturbate.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class ChaturbateIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.chaturbate.com/siswet19/',
+ 'info_dict': {
+ 'id': 'siswet19',
+ 'ext': 'mp4',
+ 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'age_limit': 18,
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://en.chaturbate.com/siswet19/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ m3u8_url = self._search_regex(
+ r'src=(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage,
+ 'playlist', default=None, group='url')
+
+ if not m3u8_url:
+ error = self._search_regex(
+ r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
+ webpage, 'error', group='error')
+ raise ExtractorError(error, expected=True)
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': self._live_title(video_id),
+ 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id,
+ 'age_limit': self._rta_search(webpage),
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
index 14f215c5c..1dfa7c12e 100644
--- a/youtube_dl/extractor/clubic.py
+++ b/youtube_dl/extractor/clubic.py
@@ -12,9 +12,9 @@ from ..utils import (
class ClubicIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
'md5': '1592b694ba586036efac1776b0b43cd3',
'info_dict': {
@@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor):
'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$',
}
- }
+ }, {
+ 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py
new file mode 100644
index 000000000..57e643799
--- /dev/null
+++ b/youtube_dl/extractor/clyp.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ parse_iso8601,
+)
+
+
+class ClypIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
+ _TEST = {
+ 'url': 'https://clyp.it/ojz2wfah',
+ 'md5': '1d4961036c41247ecfdcc439c0cddcbb',
+ 'info_dict': {
+ 'id': 'ojz2wfah',
+ 'ext': 'mp3',
+ 'title': 'Krisson80 - bits wip wip',
+ 'description': '#Krisson80BitsWipWip #chiptune\n#wip',
+ 'duration': 263.21,
+ 'timestamp': 1443515251,
+ 'upload_date': '20150929',
+ },
+ }
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ metadata = self._download_json(
+ 'https://api.clyp.it/%s' % audio_id, audio_id)
+
+ formats = []
+ for secure in ('', 'Secure'):
+ for ext in ('Ogg', 'Mp3'):
+ format_id = '%s%s' % (secure, ext)
+ format_url = metadata.get('%sUrl' % format_id)
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'vcodec': 'none',
+ })
+ self._sort_formats(formats)
+
+ title = metadata['Title']
+ description = metadata.get('Description')
+ duration = float_or_none(metadata.get('Duration'))
+ timestamp = parse_iso8601(metadata.get('DateCreated'))
+
+ return {
+ 'id': audio_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
index e96c59f71..f1311b14f 100644
--- a/youtube_dl/extractor/cmt.py
+++ b/youtube_dl/extractor/cmt.py
@@ -4,7 +4,7 @@ from .mtv import MTVIE
class CMTIE(MTVIE):
IE_NAME = 'cmt.com'
- _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
+ _VALID_URL = r'https?://www\.cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P<videoid>\d+)'
_FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
_TESTS = [{
@@ -16,4 +16,7 @@ class CMTIE(MTVIE):
'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
'description': 'Blame It All On My Roots',
},
+ }, {
+ 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172',
+ 'only_matching': True,
}]
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 91ebb0ce5..3e4bd10b6 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
if mobj.group('shortname'):
- if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = 'http://thedailyshow.cc.com/full-episodes/'
- else:
- url = 'http://thecolbertreport.cc.com/full-episodes/'
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- assert mobj is not None
+ return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes')
if mobj.group('clip'):
if mobj.group('videotitle'):
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 5eeeda08d..5e263f8b5 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,13 +10,11 @@ import re
import socket
import sys
import time
-import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
compat_cookies,
compat_getpass,
- compat_HTTPError,
compat_http_client,
compat_urllib_error,
compat_urllib_parse,
@@ -24,6 +22,7 @@ from ..compat import (
compat_urllib_request,
compat_urlparse,
compat_str,
+ compat_etree_fromstring,
)
from ..utils import (
NO_DEFAULT,
@@ -39,6 +38,7 @@ from ..utils import (
RegexNotFoundError,
sanitize_filename,
unescapeHTML,
+ unified_strdate,
url_basename,
xpath_text,
xpath_with_ns,
@@ -152,6 +152,7 @@ class InfoExtractor(object):
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
+ release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
@@ -163,12 +164,14 @@ class InfoExtractor(object):
with the "ext" entry and one of:
* "data": The subtitles file contents
* "url": A URL pointing to the subtitles file
+ "ext" will be calculated from URL if missing
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
+ repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
@@ -307,11 +310,11 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return cls.__name__[:-2]
+ return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
- return type(self).__name__[:-2]
+ return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
@@ -458,7 +461,7 @@ class InfoExtractor(object):
return xml_string
if transform_source:
xml_string = transform_source(xml_string)
- return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@ -516,6 +519,12 @@ class InfoExtractor(object):
'%s. Use --username and --password or --netrc to provide account credentials.' % msg,
expected=True)
+ @staticmethod
+ def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+ raise ExtractorError(
+ '%s. You might want to use --proxy to workaround.' % msg,
+ expected=True)
+
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
@@ -636,8 +645,9 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
- property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+ % {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -731,8 +741,9 @@ class InfoExtractor(object):
@staticmethod
def _hidden_inputs(html):
+ html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
hidden_inputs = {}
- for input in re.findall(r'<input([^>]+)>', html):
+ for input in re.findall(r'(?i)<input([^>]+)>', html):
if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
continue
name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
@@ -746,7 +757,7 @@ class InfoExtractor(object):
def _form_hidden_inputs(self, form_id, html):
form = self._search_regex(
- r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+ r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
html, '%s form' % form_id, group='form')
return self._hidden_inputs(form)
@@ -830,7 +841,7 @@ class InfoExtractor(object):
self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
+ if isinstance(e.cause, compat_urllib_error.URLError):
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, item))
return False
@@ -861,13 +872,18 @@ class InfoExtractor(object):
time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
- transform_source=lambda s: fix_xml_ampersands(s).strip()):
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
- transform_source=transform_source)
+ transform_source=transform_source,
+ fatal=fatal)
+
+ if manifest is False:
+ return manifest
formats = []
manifest_version = '1.0'
@@ -888,7 +904,10 @@ class InfoExtractor(object):
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ f4m_formats = self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal)
+ if f4m_formats:
+ formats.extend(f4m_formats)
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -924,13 +943,15 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(
+ res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
- if m3u8_doc is False:
- return m3u8_doc
+ if res is False:
+ return res
+ m3u8_doc, urlh = res
+ m3u8_url = urlh.geturl()
last_info = None
last_media = None
kv_rex = re.compile(
@@ -1036,6 +1057,7 @@ class InfoExtractor(object):
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
description = None
+ upload_date = None
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
name = meta.attrib.get('name')
content = meta.attrib.get('content')
@@ -1045,11 +1067,22 @@ class InfoExtractor(object):
title = content
elif not description and name in ('description', 'abstract'):
description = content
+ elif not upload_date and name == 'date':
+ upload_date = unified_strdate(content)
+
+ thumbnails = [{
+ 'id': image.get('type'),
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
return {
'id': video_id,
'title': title or video_id,
'description': description,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
}
@@ -1076,7 +1109,7 @@ class InfoExtractor(object):
if not src:
continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
@@ -1108,8 +1141,10 @@ class InfoExtractor(object):
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
if proto == 'm3u8' or src_ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+ m3u8_formats = self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
continue
if src_ext == 'f4m':
@@ -1121,10 +1156,12 @@ class InfoExtractor(object):
}
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse.urlencode(f4m_params)
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+ f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
continue
- if src_url.startswith('http'):
+ if src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 3db4db4e4..6f92ae2ed 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..compat import (
@@ -12,6 +11,7 @@ from ..compat import (
)
from ..utils import (
orderedSet,
+ remove_end,
)
@@ -24,21 +24,33 @@ class CondeNastIE(InfoExtractor):
# The keys are the supported sites and the values are the name to be shown
# to the user and in the extractor description.
_SITES = {
- 'wired': 'WIRED',
+ 'allure': 'Allure',
+ 'architecturaldigest': 'Architectural Digest',
+ 'arstechnica': 'Ars Technica',
+ 'bonappetit': 'Bon Appétit',
+ 'brides': 'Brides',
+ 'cnevids': 'Condé Nast',
+ 'cntraveler': 'Condé Nast Traveler',
+ 'details': 'Details',
+ 'epicurious': 'Epicurious',
+ 'glamour': 'Glamour',
+ 'golfdigest': 'Golf Digest',
'gq': 'GQ',
+ 'newyorker': 'The New Yorker',
+ 'self': 'SELF',
+ 'teenvogue': 'Teen Vogue',
+ 'vanityfair': 'Vanity Fair',
'vogue': 'Vogue',
- 'glamour': 'Glamour',
+ 'wired': 'WIRED',
'wmagazine': 'W Magazine',
- 'vanityfair': 'Vanity Fair',
- 'cnevids': 'Condé Nast',
}
- _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
- _TEST = {
+ _TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
'md5': '1921f713ed48aabd715691f774c451f7',
'info_dict': {
@@ -47,7 +59,16 @@ class CondeNastIE(InfoExtractor):
'title': '3D Printed Speakers Lit With LED',
'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
}
- }
+ }, {
+ # JS embed
+ 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
+ 'md5': 'f1a6f9cafb7083bab74a710f65d08999',
+ 'info_dict': {
+ 'id': '55f9cf8b61646d1acf00000c',
+ 'ext': 'mp4',
+ 'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
+ }
+ }]
def _extract_series(self, url, webpage):
title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
@@ -86,8 +107,8 @@ class CondeNastIE(InfoExtractor):
info_url = base_info_url + data
info_page = self._download_webpage(info_url, video_id,
'Downloading video info')
- video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info')
- video_info = json.loads(video_info)
+ video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info')
+ video_info = self._parse_json(video_info, video_id)
formats = [{
'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']),
@@ -111,6 +132,13 @@ class CondeNastIE(InfoExtractor):
url_type = mobj.group('type')
item_id = mobj.group('id')
+ # Convert JS embed to regular embed
+ if url_type == 'embedjs':
+ parsed_url = compat_urlparse.urlparse(url)
+ url = compat_urlparse.urlunparse(parsed_url._replace(
+ path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
+ url_type = 'embed'
+
self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py
index 4fb178165..dedb810a0 100644
--- a/youtube_dl/extractor/criterion.py
+++ b/youtube_dl/extractor/criterion.py
@@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor):
final_url = self._search_regex(
r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url')
title = self._og_search_title(webpage)
- description = self._html_search_regex(
- r'<meta name="description" content="(.+?)" />',
- webpage, 'video description')
+ description = self._html_search_meta('description', webpage)
thumbnail = self._search_regex(
r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
webpage, 'thumbnail url')
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 95952bc29..6e5999c72 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -5,12 +5,12 @@ import re
import json
import base64
import zlib
-import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_request,
@@ -21,6 +21,7 @@ from ..utils import (
bytes_to_intlist,
intlist_to_bytes,
int_or_none,
+ lowercase_escape,
remove_end,
unified_strdate,
urlencode_postdata,
@@ -32,6 +33,26 @@ from ..aes import (
class CrunchyrollBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'crunchyroll'
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ login_url = 'https://www.crunchyroll.com/?a=formhandler'
+ data = urlencode_postdata({
+ 'formname': 'RpcApiUser_Login',
+ 'name': username,
+ 'password': password,
+ })
+ login_request = compat_urllib_request.Request(login_url, data)
+ login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ self._download_webpage(login_request, None, False, 'Wrong login info')
+
+ def _real_initialize(self):
+ self._login()
+
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
else compat_urllib_request.Request(url_or_request))
@@ -46,10 +67,22 @@ class CrunchyrollBaseIE(InfoExtractor):
return super(CrunchyrollBaseIE, self)._download_webpage(
request, video_id, note, errnote, fatal, tries, timeout, encoding)
+ @staticmethod
+ def _add_skip_wall(url):
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_urlparse.parse_qs(parsed_url.query)
+ # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
+ # > This content may be inappropriate for some people.
+ # > Are you sure you want to continue?
+ # since it's not disabled by default in crunchyroll account's settings.
+ # See https://github.com/rg3/youtube-dl/issues/7202.
+ qs['skip_wall'] = ['1']
+ return compat_urlparse.urlunparse(
+ parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
class CrunchyrollIE(CrunchyrollBaseIE):
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _NETRC_MACHINE = 'crunchyroll'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'info_dict': {
@@ -72,7 +105,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'id': '589804',
'ext': 'flv',
'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
- 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+ 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'Danny Choo Network',
'upload_date': '20120213',
@@ -81,10 +114,13 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# rtmp
'skip_download': True,
},
-
}, {
'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
'only_matching': True,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
+ 'only_matching': True,
}]
_FORMAT_IDS = {
@@ -94,24 +130,6 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'1080': ('80', '108'),
}
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
- self.report_login()
- login_url = 'https://www.crunchyroll.com/?a=formhandler'
- data = urlencode_postdata({
- 'formname': 'RpcApiUser_Login',
- 'name': username,
- 'password': password,
- })
- login_request = compat_urllib_request.Request(login_url, data)
- login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self._download_webpage(login_request, None, False, 'Wrong login info')
-
- def _real_initialize(self):
- self._login()
-
def _decrypt_subtitles(self, data, iv, id):
data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
@@ -217,7 +235,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
def _extract_subtitles(self, subtitle):
- sub_root = xml.etree.ElementTree.fromstring(subtitle)
+ sub_root = compat_etree_fromstring(subtitle)
return [{
'ext': 'srt',
'data': self._convert_subtitles_to_srt(sub_root),
@@ -228,7 +246,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
def _get_subtitles(self, video_id, webpage):
subtitles = {}
- for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
+ for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
sub_page = self._download_webpage(
'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
video_id, note='Downloading subtitles for ' + sub_name)
@@ -254,7 +272,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
webpage_url = 'http://www.' + mobj.group('url')
- webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
+ webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -270,11 +288,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
if 'To view this, please log in to verify you are 18 or older.' in webpage:
self.raise_login_required()
- video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
+ video_title = self._html_search_regex(
+ r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
+ webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title)
- video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
- if not video_description:
- video_description = None
+ video_description = self._html_search_regex(
+ r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id,
+ webpage, 'description', default=None)
+ if video_description:
+ video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_upload_date = self._html_search_regex(
[r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'],
webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
@@ -352,7 +374,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = "crunchyroll:playlist"
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
@@ -361,12 +383,25 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
},
'playlist_count': 13,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
+ 'info_dict': {
+ 'id': 'cosplay-complex-ova',
+ 'title': 'Cosplay Complex OVA'
+ },
+ 'playlist_count': 3,
+ 'skip': 'Georestricted',
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
+ 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
+ 'only_matching': True,
}]
def _real_extract(self, url):
show_id = self._match_id(url)
- webpage = self._download_webpage(url, show_id)
+ webpage = self._download_webpage(self._add_skip_wall(url), show_id)
title = self._html_search_regex(
r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
webpage, 'title')
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 2d90b2224..bc7823931 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'uploader': 'HotWaves1012',
'age_limit': 18,
}
+ },
+ # geo-restricted, player v5
+ {
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
}
]
@@ -119,11 +124,14 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
webpage, 'comment count', fatal=False))
player_v5 = self._search_regex(
- r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
+ [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'],
webpage, 'player v5', default=None)
if player_v5:
player = self._parse_json(player_v5, video_id)
metadata = player['metadata']
+
+ self._check_error(metadata)
+
formats = []
for quality, media_list in metadata['qualities'].items():
for media in media_list:
@@ -133,9 +141,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
type_ = media.get('type')
if type_ == 'application/vnd.lumberjack.manifest':
continue
- if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', m3u8_id='hls'))
+ ext = determine_ext(media_url)
+ if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ elif type_ == 'application/f4m' or ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ media_url, video_id, preference=-1, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
else:
f = {
'url': media_url,
@@ -201,9 +217,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'video info', flags=re.MULTILINE),
video_id)
- if info.get('error') is not None:
- msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
- raise ExtractorError(msg, expected=True)
+ self._check_error(info)
formats = []
for (key, format_id) in self._FORMATS:
@@ -246,6 +260,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'duration': info['duration']
}
+ def _check_error(self, info):
+ if info.get('error') is not None:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True)
+
def _get_subtitles(self, video_id, webpage):
try:
sub_list = self._download_webpage(
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
new file mode 100644
index 000000000..6cd395e11
--- /dev/null
+++ b/youtube_dl/extractor/democracynow.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import os.path
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ url_basename,
+ remove_start,
+)
+
+
+class DemocracynowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)'
+ IE_NAME = 'democracynow'
+ _TESTS = [{
+ 'url': 'http://www.democracynow.org/shows/2015/7/3',
+ 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
+ 'info_dict': {
+ 'id': '2015-0703-001',
+ 'ext': 'mp4',
+ 'title': 'July 03, 2015 - Democracy Now!',
+ 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
+ },
+ }, {
+ 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
+ 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
+ 'info_dict': {
+ 'id': '2015-0703-001',
+ 'ext': 'mp4',
+ 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
+ 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ description = self._og_search_description(webpage)
+
+ json_data = self._parse_json(self._search_regex(
+ r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
+ display_id)
+ video_id = None
+ formats = []
+
+ default_lang = 'en'
+
+ subtitles = {}
+
+ def add_subtitle_item(lang, info_dict):
+ if lang not in subtitles:
+ subtitles[lang] = []
+ subtitles[lang].append(info_dict)
+
+ # chapter_file are not subtitles
+ if 'caption_file' in json_data:
+ add_subtitle_item(default_lang, {
+ 'url': compat_urlparse.urljoin(url, json_data['caption_file']),
+ })
+
+ for subtitle_item in json_data.get('captions', []):
+ lang = subtitle_item.get('language', '').lower() or default_lang
+ add_subtitle_item(lang, {
+ 'url': compat_urlparse.urljoin(url, subtitle_item['url']),
+ })
+
+ for key in ('file', 'audio', 'video'):
+ media_url = json_data.get(key, '')
+ if not media_url:
+ continue
+ media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
+ video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
+ formats.append({
+ 'url': media_url,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id or display_id,
+ 'title': json_data['title'],
+ 'description': description,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py
deleted file mode 100644
index b88379e06..000000000
--- a/youtube_dl/extractor/divxstage.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from __future__ import unicode_literals
-
-from .novamov import NovaMovIE
-
-
-class DivxStageIE(NovaMovIE):
- IE_NAME = 'divxstage'
- IE_DESC = 'DivxStage'
-
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'}
-
- _HOST = 'www.divxstage.eu'
-
- _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
- _TITLE_REGEX = r'<div class="video_det">\s*<strong>([^<]+)</strong>'
- _DESCRIPTION_REGEX = r'<div class="video_det">\s*<strong>[^<]+</strong>\s*<p>([^<]+)</p>'
-
- _TEST = {
- 'url': 'http://www.divxstage.eu/video/57f238e2e5e01',
- 'md5': '63969f6eb26533a1968c4d325be63e72',
- 'info_dict': {
- 'id': '57f238e2e5e01',
- 'ext': 'flv',
- 'title': 'youtubedl test video',
- 'description': 'This is a test video for youtubedl.',
- }
- }
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index a1ee51568..7bbf617d4 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -21,7 +21,7 @@ class EaglePlatformIE(InfoExtractor):
_TESTS = [{
# http://lenta.ru/news/2015/03/06/navalny/
'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
- 'md5': '0b7994faa2bd5c0f69a3db6db28d078d',
+ 'md5': '70f5187fb620f2c1d503b3b22fd4efe3',
'info_dict': {
'id': '227304',
'ext': 'mp4',
@@ -36,7 +36,7 @@ class EaglePlatformIE(InfoExtractor):
# http://muz-tv.ru/play/7129/
# http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
'url': 'eagleplatform:media.clipyou.ru:12820',
- 'md5': '6c2ebeab03b739597ce8d86339d5a905',
+ 'md5': '90b26344ba442c8e44aa4cf8f301164a',
'info_dict': {
'id': '12820',
'ext': 'mp4',
@@ -48,7 +48,8 @@ class EaglePlatformIE(InfoExtractor):
'skip': 'Georestricted',
}]
- def _handle_error(self, response):
+ @staticmethod
+ def _handle_error(response):
status = int_or_none(response.get('status', 200))
if status != 200:
raise ExtractorError(' '.join(response['errors']), expected=True)
@@ -58,6 +59,9 @@ class EaglePlatformIE(InfoExtractor):
self._handle_error(response)
return response
+ def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
+ return self._download_json(url_or_request, video_id, note)['data'][0]
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
@@ -69,7 +73,7 @@ class EaglePlatformIE(InfoExtractor):
title = media['title']
description = media.get('description')
- thumbnail = media.get('snapshot')
+ thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')
duration = int_or_none(media.get('duration'))
view_count = int_or_none(media.get('views'))
@@ -78,13 +82,20 @@ class EaglePlatformIE(InfoExtractor):
if age_restriction:
age_limit = 0 if age_restriction == 'allow_all' else 18
- m3u8_data = self._download_json(
- self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:'),
- video_id, 'Downloading m3u8 JSON')
+ secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')
+ m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
formats = self._extract_m3u8_formats(
- m3u8_data['data'][0], video_id,
- 'mp4', entry_protocol='m3u8_native')
+ m3u8_url, video_id,
+ 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+ mp4_url = self._get_video_url(
+ # Secure mp4 URL is constructed according to Player.prototype.mp4 from
+ # http://lentaru.media.eagleplatform.com/player/player.js
+ re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8),
+ video_id, 'Downloading mp4 JSON')
+ formats.append({'url': mp4_url, 'format_id': 'mp4'})
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py
index 2cba82532..357a2196c 100644
--- a/youtube_dl/extractor/eitb.py
+++ b/youtube_dl/extractor/eitb.py
@@ -1,39 +1,92 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .brightcove import BrightcoveIE
-from ..utils import ExtractorError
+from ..compat import compat_urllib_request
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+)
class EitbIE(InfoExtractor):
IE_NAME = 'eitb.tv'
- _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)'
_TEST = {
- 'add_ie': ['Brightcove'],
- 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
+ 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/',
'md5': 'edf4436247185adee3ea18ce64c47998',
'info_dict': {
- 'id': '2743577154001',
+ 'id': '4090227752001',
'ext': 'mp4',
'title': '60 minutos (Lasa y Zabala, 30 años)',
- # All videos from eitb has this description in the brightcove info
- 'description': '.',
- 'uploader': 'Euskal Telebista',
+ 'description': 'Programa de reportajes de actualidad.',
+ 'duration': 3996.76,
+ 'timestamp': 1381789200,
+ 'upload_date': '20131014',
+ 'tags': list,
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- chapter_id = mobj.group('chapter_id')
- webpage = self._download_webpage(url, chapter_id)
- bc_url = BrightcoveIE._extract_brightcove_url(webpage)
- if bc_url is None:
- raise ExtractorError('Could not extract the Brightcove url')
- # The BrightcoveExperience object doesn't contain the video id, we set
- # it manually
- bc_url += '&%40videoPlayer={0}'.format(chapter_id)
- return self.url_result(bc_url, BrightcoveIE.ie_key())
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id,
+ video_id, 'Downloading video JSON')
+
+ media = video['web_media'][0]
+
+ formats = []
+ for rendition in media['RENDITIONS']:
+ video_url = rendition.get('PMD_URL')
+ if not video_url:
+ continue
+ tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000)
+ format_id = 'http'
+ if tbr:
+ format_id += '-%d' % int(tbr)
+ formats.append({
+ 'url': rendition['PMD_URL'],
+ 'format_id': format_id,
+ 'width': int_or_none(rendition.get('FRAME_WIDTH')),
+ 'height': int_or_none(rendition.get('FRAME_HEIGHT')),
+ 'tbr': tbr,
+ })
+
+ hls_url = media.get('HLS_SURL')
+ if hls_url:
+ request = compat_urllib_request.Request(
+ 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/',
+ headers={'Referer': url})
+ token_data = self._download_json(
+ request, video_id, 'Downloading auth token', fatal=False)
+ if token_data:
+ token = token_data.get('token')
+ if token:
+ m3u8_formats = self._extract_m3u8_formats(
+ '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+
+ hds_url = media.get('HDS_SURL')
+ if hds_url:
+ f4m_formats = self._extract_f4m_formats(
+ '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'),
+ video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'],
+ 'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'),
+ 'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'),
+ 'duration': float_or_none(media.get('LENGTH'), 1000),
+ 'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '),
+ 'tags': media.get('TAGS'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index 4ea37ebd9..e4180701d 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -10,7 +10,7 @@ from ..utils import (
class EngadgetIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video/5min/(?P<id>\d+)|
+ (?:video(?:/5min)?/(?P<id>\d+)|
[\d/]+/.*?)
'''
diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py
new file mode 100644
index 000000000..adc43919e
--- /dev/null
+++ b/youtube_dl/extractor/europa.py
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ orderedSet,
+ parse_duration,
+ qualities,
+ unified_strdate,
+ xpath_text
+)
+
+
+class EuropaIE(InfoExtractor):
+ _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
+ 'md5': '574f080699ddd1e19a675b0ddf010371',
+ 'info_dict': {
+ 'id': 'I107758',
+ 'ext': 'mp4',
+ 'title': 'TRADE - Wikileaks on TTIP',
+ 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20150811',
+ 'duration': 34,
+ 'view_count': int,
+ 'formats': 'mincount:3',
+ }
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ playlist = self._download_xml(
+ 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id)
+
+ def get_item(type_, preference):
+ items = {}
+ for item in playlist.findall('./info/%s/item' % type_):
+ lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+ if lang and label:
+ items[lang] = label.strip()
+ for p in preference:
+ if items.get(p):
+ return items[p]
+
+ query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ preferred_lang = query.get('sitelang', ('en', ))[0]
+
+ preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
+
+ title = get_item('title', preferred_langs) or video_id
+ description = get_item('description', preferred_langs)
+ thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail')
+ upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
+ duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
+ view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
+
+ language_preference = qualities(preferred_langs[::-1])
+
+ formats = []
+ for file_ in playlist.findall('./files/file'):
+ video_url = xpath_text(file_, './url')
+ if not video_url:
+ continue
+ lang = xpath_text(file_, './lg')
+ formats.append({
+ 'url': video_url,
+ 'format_id': lang,
+ 'format_note': xpath_text(file_, './lglabel'),
+ 'language_preference': language_preference(lang)
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnmail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py
index a38b773e8..1585a03bb 100644
--- a/youtube_dl/extractor/expotv.py
+++ b/youtube_dl/extractor/expotv.py
@@ -33,20 +33,27 @@ class ExpoTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
player_key = self._search_regex(
r'<param name="playerKey" value="([^"]+)"', webpage, 'player key')
- config_url = 'http://client.expotv.com/video/config/%s/%s' % (
- video_id, player_key)
config = self._download_json(
- config_url, video_id,
- note='Downloading video configuration')
+ 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key),
+ video_id, 'Downloading video configuration')
- formats = [{
- 'url': fcfg['file'],
- 'height': int_or_none(fcfg.get('height')),
- 'format_note': fcfg.get('label'),
- 'ext': self._search_regex(
- r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'],
- 'file extension', default=None),
- } for fcfg in config['sources']]
+ formats = []
+ for fcfg in config['sources']:
+ media_url = fcfg.get('file')
+ if not media_url:
+ continue
+ if fcfg.get('type') == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': media_url,
+ 'height': int_or_none(fcfg.get('height')),
+ 'format_id': fcfg.get('label'),
+ 'ext': self._search_regex(
+ r'filename=.*\.([a-z0-9_A-Z]+)&', media_url,
+ 'file extension', default=None) or fcfg.get('type'),
+ })
self._sort_formats(formats)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index c826a5404..c5677c82b 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -3,23 +3,20 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_request
from ..utils import (
- qualities,
+ int_or_none,
str_to_int,
)
class ExtremeTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'md5': '344d0c6d50e2f16b06e49ca011d8ac69',
'info_dict': {
- 'id': '652431',
+ 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'ext': 'mp4',
'title': 'Music Video 14 british euro brit european cumshots swallow',
'uploader': 'unknown',
@@ -29,12 +26,16 @@ class ExtremeTubeIE(InfoExtractor):
}, {
'url': 'http://www.extremetube.com/gay/video/abcde-1234',
'only_matching': True,
+ }, {
+ 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.extremetube.com/video/652431',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- url = 'http://www.' + mobj.group('url')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -49,20 +50,36 @@ class ExtremeTubeIE(InfoExtractor):
r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>',
webpage, 'view count', fatal=False))
- flash_vars = compat_parse_qs(self._search_regex(
- r'<param[^>]+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars'))
+ flash_vars = self._parse_json(
+ self._search_regex(
+ r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'),
+ video_id)
formats = []
- quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p'])
- for k, vals in flash_vars.items():
- m = re.match(r'quality_(?P<quality>[0-9]+p)$', k)
- if m is not None:
- formats.append({
- 'format_id': m.group('quality'),
- 'quality': quality(m.group('quality')),
- 'url': vals[0],
+ for quality_key, video_url in flash_vars.items():
+ height = int_or_none(self._search_regex(
+ r'quality_(\d+)[pP]$', quality_key, 'height', default=None))
+ if not height:
+ continue
+ f = {
+ 'url': video_url,
+ }
+ mobj = re.search(
+ r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
+ if mobj:
+ height = int(mobj.group('height'))
+ bitrate = int(mobj.group('bitrate'))
+ f.update({
+ 'format_id': '%dp-%dk' % (height, bitrate),
+ 'height': height,
+ 'tbr': bitrate,
})
-
+ else:
+ f.update({
+ 'format_id': '%dp' % height,
+ 'height': height,
+ })
+ formats.append(f)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 178a7ca4c..f53c51615 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -14,7 +14,6 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
- int_or_none,
limit_length,
urlencode_postdata,
get_element_by_id,
@@ -142,16 +141,20 @@ class FacebookIE(InfoExtractor):
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse_unquote(data['params'])
params = json.loads(params_raw)
- video_data = params['video_data'][0]
formats = []
- for quality in ['sd', 'hd']:
- src = video_data.get('%s_src' % quality)
- if src is not None:
- formats.append({
- 'format_id': quality,
- 'url': src,
- })
+ for format_id, f in params['video_data'].items():
+ if not f or not isinstance(f, list):
+ continue
+ for quality in ('sd', 'hd'):
+ for src_type in ('src', 'src_no_ratelimit'):
+ src = f[0].get('%s_%s' % (quality, src_type))
+ if src:
+ formats.append({
+ 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+ 'url': src,
+ 'preference': -10 if format_id == 'progressive' else 0,
+ })
if not formats:
raise ExtractorError('Cannot find video formats')
@@ -171,7 +174,5 @@ class FacebookIE(InfoExtractor):
'id': video_id,
'title': video_title,
'formats': formats,
- 'duration': int_or_none(video_data.get('video_duration')),
- 'thumbnail': video_data.get('thumbnail_src'),
'uploader': uploader,
}
diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py
new file mode 100644
index 000000000..f1f150ef2
--- /dev/null
+++ b/youtube_dl/extractor/fczenit.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class FczenitIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://fc-zenit.ru/video/gl6785/',
+ 'md5': '458bacc24549173fe5a5aa29174a5606',
+ 'info_dict': {
+ 'id': '6785',
+ 'ext': 'mp4',
+ 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+ bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
+ bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+
+ formats = [{
+ 'url': furl,
+ 'tbr': tbr,
+ } for furl, tbr in bitrates]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 157094e8c..2955965d9 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -2,11 +2,15 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_urllib_parse,
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
+ parse_duration,
+ replace_extension,
)
@@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor):
'id': '518013791',
'ext': 'mp4',
'title': 'iPad Mini with Retina Display Review',
+ 'duration': 177,
},
},
{
@@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor):
'id': '518086247',
'ext': 'mp4',
'title': 'How to Make a Next-Level Fruit Salad',
+ 'duration': 184,
},
},
]
+ _ERRORS = {
+ 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.',
+ 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.',
+ 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.',
+ 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.',
+ 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+ 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+ }
+ _QUALITIES = {
+ 1: {
+ 'width': 640,
+ 'height': 360,
+ },
+ 2: {
+ 'width': 854,
+ 'height': 480,
+ },
+ 4: {
+ 'width': 1280,
+ 'height': 720,
+ },
+ 8: {
+ 'width': 1920,
+ 'height': 1080,
+ },
+ 16: {
+ 'width': 640,
+ 'height': 360,
+ },
+ 32: {
+ 'width': 854,
+ 'height': 480,
+ },
+ 64: {
+ 'width': 1280,
+ 'height': 720,
+ },
+ 128: {
+ 'width': 640,
+ 'height': 360,
+ },
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor):
'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
video_id)
if not response['success']:
- err_msg = response['errorMessage']
- if err_msg == 'ErrorVideoUserNotGeo':
- msg = 'Video not available from your location'
- else:
- msg = 'Aol said: %s' % err_msg
- raise ExtractorError(msg, expected=True, video_id=video_id)
+ raise ExtractorError(
+ '%s said: %s' % (
+ self.IE_NAME,
+ self._ERRORS.get(response['errorMessage'], response['errorMessage'])),
+ expected=True)
info = response['binding'][0]
- second_id = compat_str(int(video_id[:-2]) + 1)
formats = []
- for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
- if any(r['ID'] == quality for r in info['Renditions']):
+ parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
+ compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
+ for rendition in info['Renditions']:
+ if rendition['RenditionType'] == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
+ elif rendition['RenditionType'] == 'aac':
+ continue
+ else:
+ rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
+ quality = self._QUALITIES.get(rendition['ID'], {})
formats.append({
- 'format_id': compat_str(quality),
- 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
- 'height': height,
+ 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']),
+ 'url': rendition_url,
+ 'width': quality.get('width'),
+ 'height': quality.get('height'),
})
+ self._sort_formats(formats)
return {
'id': video_id,
'title': info['Title'],
+ 'thumbnail': info.get('ThumbURL'),
+ 'duration': parse_duration(info.get('Duration')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 190d9f9ad..40ea27895 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -1,13 +1,12 @@
from __future__ import unicode_literals
import re
-import random
-import json
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
clean_html,
+ determine_ext,
+ ExtractorError,
)
@@ -17,66 +16,40 @@ class FKTVIE(InfoExtractor):
_TEST = {
'url': 'http://fernsehkritik.tv/folge-1',
+ 'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79',
'info_dict': {
- 'id': '00011',
- 'ext': 'flv',
+ 'id': '1',
+ 'ext': 'mp4',
'title': 'Folge 1 vom 10. April 2007',
- 'description': 'md5:fb4818139c7cfe6907d4b83412a6864f',
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
}
def _real_extract(self, url):
- episode = int(self._match_id(url))
-
- video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode
- start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,
- episode)
- playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,
- 'playlist', flags=re.DOTALL)
- files = json.loads(re.sub('{[^{}]*?}', '{}', playlist))
-
- videos = []
- for i, _ in enumerate(files, 1):
- video_id = '%04d%d' % (episode, i)
- video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)
- videos.append({
- 'ext': 'flv',
- 'id': video_id,
- 'url': video_url,
- 'title': clean_html(get_element_by_id('eptitle', start_webpage)),
- 'description': clean_html(get_element_by_id('contentlist', start_webpage)),
- 'thumbnail': video_thumbnail
- })
- return {
- '_type': 'multi_video',
- 'entries': videos,
- 'id': 'folge-%s' % episode,
- }
-
-
-class FKTVPosteckeIE(InfoExtractor):
- IE_NAME = 'fernsehkritik.tv:postecke'
- _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
- _TEST = {
- 'url': 'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
- 'md5': '262f0adbac80317412f7e57b4808e5c4',
- 'info_dict': {
- 'id': '0120',
- 'ext': 'flv',
- 'title': 'Postecke 120',
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = int(mobj.group('ep'))
-
- server = random.randint(2, 4)
- video_id = '%04d' % episode
- video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode)
- video_title = 'Postecke %d' % episode
+ episode = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://fernsehkritik.tv/folge-%s/play' % episode, episode)
+ title = clean_html(self._html_search_regex(
+ '<h3>([^<]+)</h3>', webpage, 'title'))
+ matches = re.search(
+ r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>',
+ webpage)
+ if matches is None:
+ raise ExtractorError('Unable to extract the video')
+
+ poster, sources = matches.groups()
+ if poster is None:
+ self.report_warning('unable to extract thumbnail')
+
+ urls = re.findall(r'<source[^>]+src="([^"]+)"', sources)
+ formats = [{
+ 'url': furl,
+ 'format_id': determine_ext(furl),
+ } for furl in urls]
return {
- 'id': video_id,
- 'url': video_url,
- 'title': video_title,
+ 'id': episode,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': poster,
}
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index 3bb4f6239..fb6d108c0 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -46,10 +46,10 @@ class FourTubeIE(InfoExtractor):
thumbnail = self._html_search_meta('thumbnailUrl', webpage)
uploader_id = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
- webpage, 'uploader id')
+ webpage, 'uploader id', fatal=False)
uploader = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
- webpage, 'uploader')
+ webpage, 'uploader', fatal=False)
categories_html = self._search_regex(
r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
@@ -68,13 +68,24 @@ class FourTubeIE(InfoExtractor):
webpage, 'like count', fatal=False))
duration = parse_duration(self._html_search_meta('duration', webpage))
- params_js = self._search_regex(
- r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
- webpage, 'initialization parameters'
- )
- params = self._parse_json('[%s]' % params_js, video_id)
- media_id = params[0]
- sources = ['%s' % p for p in params[2]]
+ media_id = self._search_regex(
+ r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage,
+ 'media id', default=None, group='id')
+ sources = [
+ quality
+ for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)]
+ if not (media_id and sources):
+ player_js = self._download_webpage(
+ self._search_regex(
+ r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2',
+ webpage, 'player JS', group='url'),
+ video_id, 'Downloading player JS')
+ params_js = self._search_regex(
+ r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
+ player_js, 'initialization parameters')
+ params = self._parse_json('[%s]' % params_js, video_id)
+ media_id = params[0]
+ sources = ['%s' % p for p in params[2]]
token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
media_id, '+'.join(sources))
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 129984a5f..8e60cf60f 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -83,6 +83,14 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
if subtitle:
title += ' - %s' % subtitle
+ subtitles = {}
+ subtitles_list = [{
+ 'url': subformat['url'],
+ 'ext': subformat.get('format'),
+ } for subformat in info.get('subtitles', []) if subformat.get('url')]
+ if subtitles_list:
+ subtitles['fr'] = subtitles_list
+
return {
'id': video_id,
'title': title,
@@ -91,20 +99,27 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'formats': formats,
+ 'subtitles': subtitles,
}
class PluzzIE(FranceTVBaseInfoExtractor):
IE_NAME = 'pluzz.francetv.fr'
- _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html'
+ _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
# Can't use tests, videos expire in 7 days
def _real_extract(self, url):
- title = re.match(self._VALID_URL, url).group(1)
- webpage = self._download_webpage(url, title)
- video_id = self._search_regex(
- r'data-diffusion="(\d+)"', webpage, 'ID')
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._html_search_meta(
+ 'id_video', webpage, 'video id', default=None)
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-diffusion=["\'](\d+)', webpage, 'video id')
+
return self._extract_video(video_id, 'Pluzz')
@@ -120,6 +135,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'title': 'Soir 3',
'upload_date': '20130826',
'timestamp': 1377548400,
+ 'subtitles': {
+ 'fr': 'mincount:2',
+ },
},
}, {
'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index f5f13689c..7f21d7410 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -45,11 +45,20 @@ class FunnyOrDieIE(InfoExtractor):
links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
- bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates')
- bitrates = [int(b) for b in bitrates.rstrip(',').split(',')]
- bitrates.sort()
+ m3u8_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1',
+ webpage, 'm3u8 url', default=None, group='url')
formats = []
+
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+
+ bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)[,/]', m3u8_url)]
+ bitrates.sort()
+
for bitrate in bitrates:
for link in links:
formats.append({
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a3a7f66b..34d930a2d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
import os
import re
+import sys
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_parse_unquote,
compat_urllib_request,
compat_urlparse,
@@ -20,7 +22,6 @@ from ..utils import (
HEADRequest,
is_html,
orderedSet,
- parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@@ -52,6 +53,7 @@ from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
from .snagfilms import SnagFilmsEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
+from .mtv import MTVServicesEmbeddedIE
class GenericIE(InfoExtractor):
@@ -142,6 +144,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'title': 'Automatics, robotics and biocybernetics',
'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'upload_date': '20130627',
'formats': 'mincount:16',
'subtitles': 'mincount:1',
},
@@ -234,6 +237,22 @@ class GenericIE(InfoExtractor):
}
},
{
+ # redirect in Refresh HTTP header
+ 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
+ 'info_dict': {
+ 'id': 'pO8h3EaFRdo',
+ 'ext': 'mp4',
+ 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+ 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
+ 'upload_date': '20150917',
+ 'uploader_id': 'brtvofficial',
+ 'uploader': 'Boiler Room',
+ },
+ 'params': {
+ 'skip_download': False,
+ },
+ },
+ {
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
'info_dict': {
@@ -1233,7 +1252,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed, a SMIL file or a XSPF playlist?
try:
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -1613,12 +1632,9 @@ class GenericIE(InfoExtractor):
return self.url_result(url, ie='Vulture')
# Look for embedded mtvservices player
- mobj = re.search(
- r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
- webpage)
- if mobj is not None:
- url = unescapeHTML(mobj.group('url'))
- return self.url_result(url, ie='MTVServicesEmbedded')
+ mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+ if mtvservices_url:
+ return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
# Look for embedded yahoo player
mobj = re.search(
@@ -1657,7 +1673,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'MLB')
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+ r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
webpage)
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
@@ -1675,8 +1691,8 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
- re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
@@ -1827,6 +1843,9 @@ class GenericIE(InfoExtractor):
# Look also in Refresh HTTP header
refresh_header = head_response.headers.get('Refresh')
if refresh_header:
+ # In python 2 response HTTP headers are bytestrings
+ if sys.version_info < (3, 0) and isinstance(refresh_header, str):
+ refresh_header = refresh_header.decode('iso-8859-1')
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
index 33d6432a6..c65ef6bcf 100644
--- a/youtube_dl/extractor/globo.py
+++ b/youtube_dl/extractor/globo.py
@@ -14,79 +14,58 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ str_or_none,
)
class GloboIE(InfoExtractor):
- _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)'
+ _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
_API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist'
_SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s'
- _VIDEOID_REGEXES = [
- r'\bdata-video-id="(\d+)"',
- r'\bdata-player-videosids="(\d+)"',
- r'<div[^>]+\bid="(\d+)"',
- ]
-
_RESIGN_EXPIRATION = 86400
- _TESTS = [
- {
- 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/',
- 'md5': '03ebf41cb7ade43581608b7d9b71fab0',
- 'info_dict': {
- 'id': '3654973',
- 'ext': 'mp4',
- 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão',
- 'duration': 251.585,
- 'uploader': 'SporTV',
- 'uploader_id': 698,
- 'like_count': int,
- }
- },
- {
- 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
- 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
- 'info_dict': {
- 'id': '3607726',
- 'ext': 'mp4',
- 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
- 'duration': 103.204,
- 'uploader': 'Globo.com',
- 'uploader_id': 265,
- 'like_count': int,
- }
- },
- {
- 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
- 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
- 'info_dict': {
- 'id': '3652183',
- 'ext': 'mp4',
- 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
- 'duration': 110.711,
- 'uploader': 'Rede Globo',
- 'uploader_id': 196,
- 'like_count': int,
- }
+ _TESTS = [{
+ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
+ 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
+ 'info_dict': {
+ 'id': '3607726',
+ 'ext': 'mp4',
+ 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
+ 'duration': 103.204,
+ 'uploader': 'Globo.com',
+ 'uploader_id': '265',
},
- {
- 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
- 'md5': 'c1defca721ce25b2354e927d3e4b3dec',
- 'info_dict': {
- 'id': '3928201',
- 'ext': 'mp4',
- 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas',
- 'duration': 1472.906,
- 'uploader': 'Canal Brasil',
- 'uploader_id': 705,
- 'like_count': int,
- }
+ }, {
+ 'url': 'http://globoplay.globo.com/v/4581987/',
+ 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff',
+ 'info_dict': {
+ 'id': '4581987',
+ 'ext': 'mp4',
+ 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
+ 'duration': 137.973,
+ 'uploader': 'Rede Globo',
+ 'uploader_id': '196',
},
- ]
-
- class MD5():
+ }, {
+ 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
+ 'only_matching': True,
+ }]
+
+ class MD5:
HEX_FORMAT_LOWERCASE = 0
HEX_FORMAT_UPPERCASE = 1
BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = ''
@@ -353,9 +332,6 @@ class GloboIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
-
video = self._download_json(
self._API_URL_TEMPLATE % video_id, video_id)['videos'][0]
@@ -364,7 +340,7 @@ class GloboIE(InfoExtractor):
formats = []
for resource in video['resources']:
resource_id = resource.get('_id')
- if not resource_id:
+ if not resource_id or resource_id.endswith('manifest'):
continue
security = self._download_json(
@@ -393,20 +369,23 @@ class GloboIE(InfoExtractor):
resource_url = resource['url']
signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash')
if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(signed_url, resource_id, 'mp4'))
+ m3u8_formats = self._extract_m3u8_formats(
+ signed_url, resource_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
else:
formats.append({
'url': signed_url,
- 'format_id': resource_id,
- 'height': resource.get('height'),
+ 'format_id': 'http-%s' % resource_id,
+ 'height': int_or_none(resource.get('height')),
})
self._sort_formats(formats)
duration = float_or_none(video.get('duration'), 1000)
- like_count = int_or_none(video.get('likes'))
uploader = video.get('channel')
- uploader_id = video.get('channel_id')
+ uploader_id = str_or_none(video.get('channel_id'))
return {
'id': video_id,
@@ -414,6 +393,46 @@ class GloboIE(InfoExtractor):
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
- 'like_count': like_count,
'formats': formats
}
+
+
+class GloboArticleIE(InfoExtractor):
+ _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html'
+
+ _VIDEOID_REGEXES = [
+ r'\bdata-video-id=["\'](\d{7,})',
+ r'\bdata-player-videosids=["\'](\d{7,})',
+ r'\bvideosIDs\s*:\s*["\'](\d{7,})',
+ r'\bdata-id=["\'](\d{7,})',
+ r'<div[^>]+\bid=["\'](\d{7,})',
+ ]
+
+ _TESTS = [{
+ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
+ 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
+ 'info_dict': {
+ 'id': '3652183',
+ 'ext': 'mp4',
+ 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
+ 'duration': 110.711,
+ 'uploader': 'Rede Globo',
+ 'uploader_id': '196',
+ }
+ }, {
+ 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
+ return self.url_result('globo:%s' % video_id, 'Globo')
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index fcefe54cd..731bacd67 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor):
'width': int(width),
'height': int(height),
} for width, height, video_url in re.findall(
- r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)]
+ r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)]
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
deleted file mode 100644
index a3154cfde..000000000
--- a/youtube_dl/extractor/hostingbulk.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
- int_or_none,
- urlencode_postdata,
-)
-
-
-class HostingBulkIE(InfoExtractor):
- _VALID_URL = r'''(?x)
- https?://(?:www\.)?hostingbulk\.com/
- (?:embed-)?(?P<id>[A-Za-z0-9]{12})(?:-\d+x\d+)?\.html'''
- _FILE_DELETED_REGEX = r'<b>File Not Found</b>'
- _TEST = {
- 'url': 'http://hostingbulk.com/n0ulw1hv20fm.html',
- 'md5': '6c8653c8ecf7ebfa83b76e24b7b2fe3f',
- 'info_dict': {
- 'id': 'n0ulw1hv20fm',
- 'ext': 'mp4',
- 'title': 'md5:5afeba33f48ec87219c269e054afd622',
- 'filesize': 6816081,
- 'thumbnail': 're:^http://.*\.jpg$',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://hostingbulk.com/{0:}.html'.format(video_id)
-
- # Custom request with cookie to set language to English, so our file
- # deleted regex would work.
- request = compat_urllib_request.Request(
- url, headers={'Cookie': 'lang=english'})
- webpage = self._download_webpage(request, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- title = self._html_search_regex(r'<h3>(.*?)</h3>', webpage, 'title')
- filesize = int_or_none(
- self._search_regex(
- r'<small>\((\d+)\sbytes?\)</small>',
- webpage,
- 'filesize',
- fatal=False
- )
- )
- thumbnail = self._search_regex(
- r'<img src="([^"]+)".+?class="pic"',
- webpage, 'thumbnail', fatal=False)
-
- fields = self._hidden_inputs(webpage)
-
- request = compat_urllib_request.Request(url, urlencode_postdata(fields))
- request.add_header('Content-type', 'application/x-www-form-urlencoded')
- response = self._request_webpage(request, video_id,
- 'Submiting download request')
- video_url = response.geturl()
-
- formats = [{
- 'format_id': 'sd',
- 'filesize': filesize,
- 'url': video_url,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py
index 70e4c0d41..a39f422e9 100644
--- a/youtube_dl/extractor/iconosquare.py
+++ b/youtube_dl/extractor/iconosquare.py
@@ -1,7 +1,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ get_element_by_id,
+ remove_end,
+)
class IconosquareIE(InfoExtractor):
@@ -12,7 +16,7 @@ class IconosquareIE(InfoExtractor):
'info_dict': {
'id': '522207370455279102_24101272',
'ext': 'mp4',
- 'title': 'Instagram media by @aguynamedpatrick (Patrick Janelle)',
+ 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
'timestamp': 1376471991,
'upload_date': '20130814',
@@ -29,8 +33,7 @@ class IconosquareIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
media = self._parse_json(
- self._search_regex(
- r'window\.media\s*=\s*({.+?});\n', webpage, 'media'),
+ get_element_by_id('mediaJson', webpage),
video_id)
formats = [{
@@ -41,9 +44,7 @@ class IconosquareIE(InfoExtractor):
} for format_id, f in media['videos'].items()]
self._sort_formats(formats)
- title = self._html_search_regex(
- r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>',
- webpage, 'title')
+ title = remove_end(self._og_search_title(webpage), ' - via Iconosquare')
timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
description = media.get('caption', {}).get('text')
@@ -61,6 +62,14 @@ class IconosquareIE(InfoExtractor):
'height': int_or_none(t.get('height'))
} for thumbnail_id, t in media.get('images', {}).items()]
+ comments = [{
+ 'id': comment.get('id'),
+ 'text': comment['text'],
+ 'timestamp': int_or_none(comment.get('created_time')),
+ 'author': comment.get('from', {}).get('full_name'),
+ 'author_id': comment.get('from', {}).get('username'),
+ } for comment in media.get('comments', {}).get('data', []) if 'text' in comment]
+
return {
'id': video_id,
'title': title,
@@ -72,4 +81,5 @@ class IconosquareIE(InfoExtractor):
'comment_count': comment_count,
'like_count': like_count,
'formats': formats,
+ 'comments': comments,
}
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 4bb574cf3..02e1e428e 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -4,8 +4,8 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
+from ..utils import (
+ qualities,
)
@@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor):
descr = self._html_search_regex(
r'(?s)<span itemprop="description">(.*?)</span>',
webpage, 'description', fatal=False)
- available_formats = re.findall(
- r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
- flags=re.MULTILINE)
+ player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
+ player_page = self._download_webpage(
+ player_url, video_id, 'Downloading player page')
+ # the player page contains the info for the default format, we have to
+ # fetch other pages for the rest of the formats
+ extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
+ format_pages = [
+ self._download_webpage(
+ f_url, video_id, 'Downloading info for %s format' % f_name)
+ for f_url, f_name in extra_formats]
+ format_pages.append(player_page)
+
+ quality = qualities(['SD', '480p', '720p'])
formats = []
- for f_id, f_path in available_formats:
- f_path = f_path.strip()
- format_page = self._download_webpage(
- compat_urlparse.urljoin(url, f_path),
- 'Downloading info for %s format' % f_id)
+ for format_page in format_pages:
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
info = json.loads(json_data)
format_info = info['videoPlayerObject']['video']
+ f_id = format_info['ffname']
formats.append({
'format_id': f_id,
'url': format_info['videoInfoList'][0]['videoUrl'],
+ 'quality': quality(f_id),
})
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 393e67e35..2df1da3f0 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -95,6 +95,10 @@ class IqiyiIE(InfoExtractor):
('10', 'h1'),
]
+ @staticmethod
+ def md5_text(text):
+ return hashlib.md5(text.encode('utf-8')).hexdigest()
+
def construct_video_urls(self, data, video_id, _uuid):
def do_xor(x, y):
a = y % 3
@@ -121,7 +125,7 @@ class IqiyiIE(InfoExtractor):
note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
)['t']
t = str(int(math.floor(int(tm) / (600.0))))
- return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+ return self.md5_text(t + mg + x)
video_urls_dict = {}
for format_item in data['vp']['tkl'][0]['vs']:
@@ -179,20 +183,19 @@ class IqiyiIE(InfoExtractor):
def get_raw_data(self, tvid, video_id, enc_key, _uuid):
tm = str(int(time.time()))
+ tail = tm + tvid
param = {
'key': 'fvip',
- 'src': hashlib.md5(b'youtube-dl').hexdigest(),
+ 'src': self.md5_text('youtube-dl'),
'tvId': tvid,
'vid': video_id,
'vinfo': 1,
'tm': tm,
- 'enc': hashlib.md5(
- (enc_key + tm + tvid).encode('utf8')).hexdigest(),
+ 'enc': self.md5_text(enc_key + tail),
'qyid': _uuid,
'tn': random.random(),
'um': 0,
- 'authkey': hashlib.md5(
- (tm + tvid).encode('utf8')).hexdigest()
+ 'authkey': self.md5_text(self.md5_text('') + tail),
}
api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
@@ -201,7 +204,10 @@ class IqiyiIE(InfoExtractor):
return raw_data
def get_enc_key(self, swf_url, video_id):
- enc_key = '3601ba290e4f4662848c710e2122007e' # last update at 2015-08-10 for Zombie
+ # TODO: automatic key extraction
+ # last update at 2015-10-22 for Zombie::bite
+ # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2]
+ enc_key = '2c76de15dcb44bd28ff0927d50d31620'
return enc_key
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index 1df084d87..eef7daa29 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
title = mobj.group(1)
webpage = self._download_webpage(url, title)
- title = self._html_search_meta('name', webpage)
+ title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
config_url = self._html_search_regex(
r'data-src="(/contenu/medias/video.php.*?)"',
webpage, 'config URL')
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index 3dca0e566..0dcd6cd05 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -16,7 +16,7 @@ class KalturaIE(InfoExtractor):
(?:
kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
https?://
- (:?(?:www|cdnapisec)\.)?kaltura\.com/
+ (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
(?:
(?:
# flash player
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index c0956ba09..94a03d277 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -1,46 +1,39 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<id>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P<id>\w+)'
IE_NAME = 'keek'
_TEST = {
- 'url': 'https://www.keek.com/ytdl/keeks/NODfbab',
- 'md5': '09c5c109067536c1cec8bac8c21fea05',
+ 'url': 'https://www.keek.com/keek/NODfbab',
+ 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
'info_dict': {
'id': 'NODfbab',
'ext': 'mp4',
- 'uploader': 'youtube-dl project',
- 'uploader_id': 'ytdl',
- 'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .',
+ 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896',
+ 'uploader': 'ytdl',
+ 'uploader_id': 'eGT5bab',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
- video_url = 'http://cdn.keek.com/keek/video/%s' % video_id
- thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- raw_desc = self._html_search_meta('description', webpage)
- if raw_desc:
- uploader = self._html_search_regex(
- r'Watch (.*?)\s+\(', raw_desc, 'uploader', fatal=False)
- uploader_id = self._html_search_regex(
- r'Watch .*?\(@(.+?)\)', raw_desc, 'uploader_id', fatal=False)
- else:
- uploader = None
- uploader_id = None
-
return {
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': 'mp4',
- 'title': self._og_search_title(webpage),
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
+ 'title': self._og_search_description(webpage).strip(),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': self._search_regex(
+ r'data-username=(["\'])(?P<uploader>.+?)\1', webpage,
+ 'uploader', fatal=False, group='uploader'),
+ 'uploader_id': self._search_regex(
+ r'data-user-id=(["\'])(?P<uploader_id>.+?)\1', webpage,
+ 'uploader id', fatal=False, group='uploader_id'),
}
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index fa233377d..0c8ed5d07 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -57,6 +57,7 @@ class KuwoIE(KuwoBaseIE):
'upload_date': '20080122',
'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
},
+ 'skip': 'this song has been offline because of copyright issues',
}, {
'url': 'http://www.kuwo.cn/yinyue/6446136/',
'info_dict': {
@@ -76,9 +77,11 @@ class KuwoIE(KuwoBaseIE):
webpage = self._download_webpage(
url, song_id, note='Download song detail info',
errnote='Unable to get song detail info')
+ if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
+ raise ExtractorError('this song has been offline because of copyright issues', expected=True)
song_name = self._html_search_regex(
- r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
+ r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name')
singer_name = self._html_search_regex(
r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
webpage, 'singer name', fatal=False)
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py
index a28abb0f0..effd9eb92 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/letv.py
@@ -9,13 +9,14 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urllib_request,
- compat_urlparse,
+ compat_ord,
)
from ..utils import (
determine_ext,
ExtractorError,
parse_iso8601,
int_or_none,
+ encode_data_uri,
)
@@ -25,15 +26,16 @@ class LetvIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.letv.com/ptv/vplay/22005890.html',
- 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df',
+ 'md5': 'edadcfe5406976f42f9f266057ee5e40',
'info_dict': {
'id': '22005890',
'ext': 'mp4',
'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
- 'timestamp': 1424747397,
- 'upload_date': '20150224',
'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
- }
+ },
+ 'params': {
+ 'hls_prefer_native': True,
+ },
}, {
'url': 'http://www.letv.com/ptv/vplay/1415246.html',
'info_dict': {
@@ -42,16 +44,22 @@ class LetvIE(InfoExtractor):
'title': '美人天下01',
'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
},
+ 'params': {
+ 'hls_prefer_native': True,
+ },
}, {
'note': 'This video is available only in Mainland China, thus a proxy is needed',
'url': 'http://www.letv.com/ptv/vplay/1118082.html',
- 'md5': 'f80936fbe20fb2f58648e81386ff7927',
+ 'md5': '2424c74948a62e5f31988438979c5ad1',
'info_dict': {
'id': '1118082',
'ext': 'mp4',
'title': '与龙共舞 完整版',
'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
},
+ 'params': {
+ 'hls_prefer_native': True,
+ },
'skip': 'Only available in China',
}]
@@ -74,6 +82,27 @@ class LetvIE(InfoExtractor):
_loc3_ = self.ror(_loc3_, _loc2_ % 17)
return _loc3_
+ # see M3U8Encryption class in KLetvPlayer.swf
+ @staticmethod
+ def decrypt_m3u8(encrypted_data):
+ if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
+ return encrypted_data
+ encrypted_data = encrypted_data[5:]
+
+ _loc4_ = bytearray()
+ while encrypted_data:
+ b = compat_ord(encrypted_data[0])
+ _loc4_.extend([b // 16, b & 0x0f])
+ encrypted_data = encrypted_data[1:]
+ idx = len(_loc4_) - 11
+ _loc4_ = _loc4_[idx:] + _loc4_[:idx]
+ _loc7_ = bytearray()
+ while _loc4_:
+ _loc7_.append(_loc4_[0] * 16 + _loc4_[1])
+ _loc4_ = _loc4_[2:]
+
+ return bytes(_loc7_)
+
def _real_extract(self, url):
media_id = self._match_id(url)
page = self._download_webpage(url, media_id)
@@ -115,23 +144,28 @@ class LetvIE(InfoExtractor):
for format_id in formats:
if format_id in dispatch:
media_url = playurl['domain'][0] + dispatch[format_id][0]
-
- # Mimic what flvxz.com do
- url_parts = list(compat_urlparse.urlparse(media_url))
- qs = dict(compat_urlparse.parse_qs(url_parts[4]))
- qs.update({
- 'platid': '14',
- 'splatid': '1401',
- 'tss': 'no',
- 'retry': 1
+ media_url += '&' + compat_urllib_parse.urlencode({
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'rateid': format_id,
})
- url_parts[4] = compat_urllib_parse.urlencode(qs)
- media_url = compat_urlparse.urlunparse(url_parts)
+
+ nodes_data = self._download_json(
+ media_url, media_id,
+ 'Download JSON metadata for format %s' % format_id)
+
+ req = self._request_webpage(
+ nodes_data['nodelist'][0]['location'], media_id,
+ note='Downloading m3u8 information for format %s' % format_id)
+
+ m3u8_data = self.decrypt_m3u8(req.read())
url_info_dict = {
- 'url': media_url,
+ 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
'ext': determine_ext(dispatch[format_id][1]),
'format_id': format_id,
+ 'protocol': 'm3u8',
}
if format_id[-1:] == 'p':
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
new file mode 100644
index 000000000..fb03dd527
--- /dev/null
+++ b/youtube_dl/extractor/limelight.py
@@ -0,0 +1,229 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+)
+
+
+class LimelightBaseIE(InfoExtractor):
+ _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
+ _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+
+ def _call_playlist_service(self, item_id, method, fatal=True):
+ return self._download_json(
+ self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
+ item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal)
+
+ def _call_api(self, organization_id, item_id, method):
+ return self._download_json(
+ self._API_URL % (organization_id, self._API_PATH, item_id, method),
+ item_id, 'Downloading API %s JSON' % method)
+
+ def _extract(self, item_id, pc_method, mobile_method, meta_method):
+ pc = self._call_playlist_service(item_id, pc_method)
+ metadata = self._call_api(pc['orgId'], item_id, meta_method)
+ mobile = self._call_playlist_service(item_id, mobile_method, fatal=False)
+ return pc, mobile, metadata
+
+ def _extract_info(self, streams, mobile_urls, properties):
+ video_id = properties['media_id']
+ formats = []
+
+ for stream in streams:
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ if '.f4m' in stream_url:
+ formats.extend(self._extract_f4m_formats(stream_url, video_id))
+ else:
+ fmt = {
+ 'url': stream_url,
+ 'abr': float_or_none(stream.get('audioBitRate')),
+ 'vbr': float_or_none(stream.get('videoBitRate')),
+ 'fps': float_or_none(stream.get('videoFrameRate')),
+ 'width': int_or_none(stream.get('videoWidthInPixels')),
+ 'height': int_or_none(stream.get('videoHeightInPixels')),
+ 'ext': determine_ext(stream_url)
+ }
+ rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+ if rtmp:
+ format_id = 'rtmp'
+ if stream.get('videoBitRate'):
+ format_id += '-%d' % int_or_none(stream['videoBitRate'])
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': format_id,
+ })
+ formats.append(fmt)
+
+ for mobile_url in mobile_urls:
+ media_url = mobile_url.get('mobileUrl')
+ if not media_url:
+ continue
+ format_id = mobile_url.get('targetMediaPlatform')
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=-1, m3u8_id=format_id))
+ else:
+ formats.append({
+ 'url': media_url,
+ 'format_id': format_id,
+ 'preference': -1,
+ })
+
+ self._sort_formats(formats)
+
+ title = properties['title']
+ description = properties.get('description')
+ timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date'))
+ duration = float_or_none(properties.get('duration_in_milliseconds'), 1000)
+ filesize = int_or_none(properties.get('total_storage_in_bytes'))
+ categories = [properties.get('category')]
+ tags = properties.get('tags', [])
+ thumbnails = [{
+ 'url': thumbnail['url'],
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
+
+ subtitles = {}
+ for caption in properties.get('captions', {}):
+ lang = caption.get('language_code')
+ subtitles_url = caption.get('url')
+ if lang and subtitles_url:
+ subtitles[lang] = [{
+ 'url': subtitles_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'filesize': filesize,
+ 'categories': categories,
+ 'tags': tags,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
+ }
+
+
+class LimelightMediaIE(LimelightBaseIE):
+ IE_NAME = 'limelight'
+ _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
+ _TESTS = [{
+ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
+ 'info_dict': {
+ 'id': '3ffd040b522b4485b6d84effc750cd86',
+ 'ext': 'flv',
+ 'title': 'HaP and the HB Prince Trailer',
+ 'description': 'md5:8005b944181778e313d95c1237ddb640',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 144.23,
+ 'timestamp': 1244136834,
+ 'upload_date': '20090604',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # video with subtitles
+ 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+ 'info_dict': {
+ 'id': 'a3e00274d4564ec4a9b29b9466432335',
+ 'ext': 'flv',
+ 'title': '3Play Media Overview Video',
+ 'description': '',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 78.101,
+ 'timestamp': 1338929955,
+ 'upload_date': '20120605',
+ 'subtitles': 'mincount:9',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+ _PLAYLIST_SERVICE_PATH = 'media'
+ _API_PATH = 'media'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ pc, mobile, metadata = self._extract(
+ video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties')
+
+ return self._extract_info(
+ pc['playlistItems'][0].get('streams', []),
+ mobile['mediaList'][0].get('mobileUrls', []) if mobile else [],
+ metadata)
+
+
+class LimelightChannelIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel'
+ _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
+ 'info_dict': {
+ 'id': 'ab6a524c379342f9b23642917020c082',
+ 'title': 'Javascript Sample Code',
+ },
+ 'playlist_mincount': 3,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel'
+ _API_PATH = 'channels'
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ pc, mobile, medias = self._extract(
+ channel_id, 'getPlaylistByChannelId',
+ 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media')
+
+ entries = [
+ self._extract_info(
+ pc['playlistItems'][i].get('streams', []),
+ mobile['mediaList'][i].get('mobileUrls', []) if mobile else [],
+ medias['media_list'][i])
+ for i in range(len(medias['media_list']))]
+
+ return self.playlist_result(entries, channel_id, pc['title'])
+
+
+class LimelightChannelListIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel_list'
+ _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
+ 'info_dict': {
+ 'id': '301b117890c4465c8179ede21fd92e2b',
+ 'title': 'Website - Hero Player',
+ },
+ 'playlist_mincount': 2,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel_list'
+
+ def _real_extract(self, url):
+ channel_list_id = self._match_id(url)
+
+ channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById')
+
+ entries = [
+ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
+ for channel in channel_list['channelList']]
+
+ return self.playlist_result(entries, channel_list_id, channel_list['title'])
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 378117270..9a207b2cd 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -82,6 +82,11 @@ class LyndaBaseIE(InfoExtractor):
expected=True)
raise ExtractorError('Unable to log in')
+ def _logout(self):
+ self._download_webpage(
+ 'http://www.lynda.com/ajax/logout.aspx', None,
+ 'Logging out', 'Unable to log out', fatal=False)
+
class LyndaIE(LyndaBaseIE):
IE_NAME = 'lynda'
@@ -108,50 +113,47 @@ class LyndaIE(LyndaBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- page = self._download_webpage(
+ video = self._download_json(
'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
video_id, 'Downloading video JSON')
- video_json = json.loads(page)
- if 'Status' in video_json:
+ if 'Status' in video:
raise ExtractorError(
- 'lynda returned error: %s' % video_json['Message'], expected=True)
+ 'lynda returned error: %s' % video['Message'], expected=True)
- if video_json['HasAccess'] is False:
+ if video.get('HasAccess') is False:
self.raise_login_required('Video %s is only available for members' % video_id)
- video_id = compat_str(video_json['ID'])
- duration = video_json['DurationInSeconds']
- title = video_json['Title']
+ video_id = compat_str(video.get('ID') or video_id)
+ duration = int_or_none(video.get('DurationInSeconds'))
+ title = video['Title']
formats = []
- fmts = video_json.get('Formats')
+ fmts = video.get('Formats')
if fmts:
- formats.extend([
- {
- 'url': fmt['Url'],
- 'ext': fmt['Extension'],
- 'width': fmt['Width'],
- 'height': fmt['Height'],
- 'filesize': fmt['FileSize'],
- 'format_id': str(fmt['Resolution'])
- } for fmt in fmts])
-
- prioritized_streams = video_json.get('PrioritizedStreams')
+ formats.extend([{
+ 'url': f['Url'],
+ 'ext': f.get('Extension'),
+ 'width': int_or_none(f.get('Width')),
+ 'height': int_or_none(f.get('Height')),
+ 'filesize': int_or_none(f.get('FileSize')),
+ 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
+ } for f in fmts if f.get('Url')])
+
+ prioritized_streams = video.get('PrioritizedStreams')
if prioritized_streams:
- formats.extend([
- {
+ for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
+ formats.extend([{
'url': video_url,
'width': int_or_none(format_id),
- 'format_id': format_id,
- } for format_id, video_url in prioritized_streams['0'].items()
- ])
+ 'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+ } for format_id, video_url in prioritized_stream.items()])
self._check_formats(formats, video_id)
self._sort_formats(formats)
- subtitles = self.extract_subtitles(video_id, page)
+ subtitles = self.extract_subtitles(video_id)
return {
'id': video_id,
@@ -182,7 +184,7 @@ class LyndaIE(LyndaBaseIE):
if srt:
return srt
- def _get_subtitles(self, video_id, webpage):
+ def _get_subtitles(self, video_id):
url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
subs = self._download_json(url, None, False)
if subs:
@@ -204,12 +206,13 @@ class LyndaCourseIE(LyndaBaseIE):
course_path = mobj.group('coursepath')
course_id = mobj.group('courseid')
- page = self._download_webpage(
+ course = self._download_json(
'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
course_id, 'Downloading course JSON')
- course_json = json.loads(page)
- if 'Status' in course_json and course_json['Status'] == 'NotFound':
+ self._logout()
+
+ if course.get('Status') == 'NotFound':
raise ExtractorError(
'Course %s does not exist' % course_id, expected=True)
@@ -219,12 +222,13 @@ class LyndaCourseIE(LyndaBaseIE):
# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
# by single video API anymore
- for chapter in course_json['Chapters']:
- for video in chapter['Videos']:
- if video['HasAccess'] is False:
+ for chapter in course['Chapters']:
+ for video in chapter.get('Videos', []):
+ if video.get('HasAccess') is False:
unaccessible_videos += 1
continue
- videos.append(video['ID'])
+ if video.get('ID'):
+ videos.append(video['ID'])
if unaccessible_videos > 0:
self._downloader.report_warning(
@@ -237,6 +241,6 @@ class LyndaCourseIE(LyndaBaseIE):
'Lynda')
for video_id in videos]
- course_title = course_json['Title']
+ course_title = course.get('Title')
return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index fc7499958..88334889e 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -1,64 +1,169 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ xpath_text,
+)
class MDRIE(InfoExtractor):
- _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
+ IE_DESC = 'MDR.DE and KiKA'
+ _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
- # No tests, MDR regularily deletes its videos
- _TEST = {
+ _TESTS = [{
+ # MDR regularly deletes its videos
'url': 'http://www.mdr.de/fakt/video189002.html',
'only_matching': True,
- }
+ }, {
+ # audio
+ 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html',
+ 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa',
+ 'info_dict': {
+ 'id': '1312272',
+ 'ext': 'mp3',
+ 'title': 'Feuilleton vom 30. Oktober 2015',
+ 'duration': 250,
+ 'uploader': 'MITTELDEUTSCHER RUNDFUNK',
+ },
+ }, {
+ 'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
+ 'md5': '4930515e36b06c111213e80d1e4aad0e',
+ 'info_dict': {
+ 'id': '19636',
+ 'ext': 'mp4',
+ 'title': 'Baumhaus vom 30. Oktober 2015',
+ 'duration': 134,
+ 'uploader': 'KIKA',
+ },
+ }, {
+ 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
+ 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
+ 'info_dict': {
+ 'id': '8182',
+ 'ext': 'mp4',
+ 'title': 'Beutolomäus und der geheime Weihnachtswunsch',
+ 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
+ 'timestamp': 1419047100,
+ 'upload_date': '20141220',
+ 'duration': 4628,
+ 'uploader': 'KIKA',
+ },
+ }, {
+ 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('video_id')
- domain = m.group('domain')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data_url = self._search_regex(
+ r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
+ webpage, 'data url', group='url')
- # determine title and media streams from webpage
- html = self._download_webpage(url, video_id)
+ doc = self._download_xml(
+ compat_urlparse.urljoin(url, data_url), video_id)
- title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
- xmlurl = self._search_regex(
- r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
+ title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True)
- doc = self._download_xml(domain + xmlurl, video_id)
formats = []
- for a in doc.findall('./assets/asset'):
- url_el = a.find('./progressiveDownloadUrl')
- if url_el is None:
- continue
- abr = int(a.find('bitrateAudio').text) // 1000
- media_type = a.find('mediaType').text
- format = {
- 'abr': abr,
- 'filesize': int(a.find('fileSize').text),
- 'url': url_el.text,
- }
-
- vbr_el = a.find('bitrateVideo')
- if vbr_el is None:
- format.update({
- 'vcodec': 'none',
- 'format_id': '%s-%d' % (media_type, abr),
- })
- else:
- vbr = int(vbr_el.text) // 1000
- format.update({
- 'vbr': vbr,
- 'width': int(a.find('frameWidth').text),
- 'height': int(a.find('frameHeight').text),
- 'format_id': '%s-%d' % (media_type, vbr),
- })
- formats.append(format)
+ processed_urls = []
+ for asset in doc.findall('./assets/asset'):
+ for source in (
+ 'progressiveDownload',
+ 'dynamicHttpStreamingRedirector',
+ 'adaptiveHttpStreamingRedirector'):
+ url_el = asset.find('./%sUrl' % source)
+ if url_el is None:
+ continue
+
+ video_url = url_el.text
+ if video_url in processed_urls:
+ continue
+
+ processed_urls.append(video_url)
+
+ vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
+ abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
+
+ ext = determine_ext(url_el.text)
+ if ext == 'm3u8':
+ url_formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=0, m3u8_id='HLS', fatal=False)
+ elif ext == 'f4m':
+ url_formats = self._extract_f4m_formats(
+ video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
+ preference=0, f4m_id='HDS', fatal=False)
+ else:
+ media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
+ vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
+ abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
+ filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))
+
+ f = {
+ 'url': video_url,
+ 'format_id': '%s-%d' % (media_type, vbr or abr),
+ 'filesize': filesize,
+ 'abr': abr,
+ 'preference': 1,
+ }
+
+ if vbr:
+ width = int_or_none(xpath_text(asset, './frameWidth', 'width'))
+ height = int_or_none(xpath_text(asset, './frameHeight', 'height'))
+ f.update({
+ 'vbr': vbr,
+ 'width': width,
+ 'height': height,
+ })
+
+ url_formats = [f]
+
+ if not url_formats:
+ continue
+
+ if not vbr:
+ for f in url_formats:
+ abr = f.get('tbr') or abr
+ if 'tbr' in f:
+ del f['tbr']
+ f.update({
+ 'abr': abr,
+ 'vcodec': 'none',
+ })
+
+ formats.extend(url_formats)
+
self._sort_formats(formats)
+ description = xpath_text(doc, './broadcast/broadcastDescription', 'description')
+ timestamp = parse_iso8601(
+ xpath_text(
+ doc, [
+ './broadcast/broadcastDate',
+ './broadcast/broadcastStartDate',
+ './broadcast/broadcastEndDate'],
+ 'timestamp', default=None))
+ duration = parse_duration(xpath_text(doc, './duration', 'duration'))
+ uploader = xpath_text(doc, './rights', 'uploader')
+
return {
'id': video_id,
'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'uploader': uploader,
'formats': formats,
}
diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py
deleted file mode 100644
index af7ff07ea..000000000
--- a/youtube_dl/extractor/megavideoz.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
- xpath_text,
-)
-
-
-class MegaVideozIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P<id>[^/]+)(?:/(?P<display_id>[^/]+))?'
- _TEST = {
- 'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader',
- 'info_dict': {
- 'id': '48723',
- 'display_id': 'SMPTE-Universal-Film-Leader',
- 'ext': 'mp4',
- 'title': 'SMPTE Universal Film Leader',
- 'thumbnail': 're:https?://.*?\.jpg',
- 'duration': 10.93,
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id') or video_id
-
- webpage = self._download_webpage(url, display_id)
-
- if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')):
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
- config = self._download_xml(
- self._search_regex(
- r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'),
- display_id)
-
- video_url = xpath_text(config, './file', 'video url', fatal=True)
- title = xpath_text(config, './title', 'title', fatal=True)
- thumbnail = xpath_text(config, './image', 'thumbnail')
- duration = float_or_none(xpath_text(config, './duration', 'duration'))
- video_id = xpath_text(config, './mediaid', 'video id') or video_id
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'url': video_url,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration
- }
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
index a784fc5fb..ce391c759 100644
--- a/youtube_dl/extractor/miomio.py
+++ b/youtube_dl/extractor/miomio.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import random
from .common import InfoExtractor
+from ..compat import compat_urllib_request
from ..utils import (
xpath_text,
int_or_none,
@@ -51,6 +52,8 @@ class MioMioIE(InfoExtractor):
mioplayer_path = self._search_regex(
r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
+ http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+
xml_config = self._search_regex(
r'flashvars="type=(?:sina|video)&amp;(.+?)&amp;',
webpage, 'xml config')
@@ -60,14 +63,12 @@ class MioMioIE(InfoExtractor):
'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)),
video_id)
- # the following xml contains the actual configuration information on the video file(s)
- vid_config = self._download_xml(
+ vid_config_request = compat_urllib_request.Request(
'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
- video_id)
+ headers=http_headers)
- http_headers = {
- 'Referer': 'http://www.miomio.tv%s' % mioplayer_path,
- }
+ # the following xml contains the actual configuration information on the video file(s)
+ vid_config = self._download_xml(vid_config_request, video_id)
if not int_or_none(xpath_text(vid_config, 'timelength')):
raise ExtractorError('Unable to load videos!', expected=True)
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index f088ab9e2..29ca45778 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -86,7 +86,7 @@ class MITIE(TechTVMITIE):
webpage = self._download_webpage(url, page_title)
embed_url = self._search_regex(
r'<iframe .*?src="(.+?)"', webpage, 'embed url')
- return self.url_result(embed_url, ie='TechTVMIT')
+ return self.url_result(embed_url)
class OCWMITIE(InfoExtractor):
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 852d72266..c595f2077 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,74 +1,89 @@
from __future__ import unicode_literals
-import json
-
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
- compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
+ encode_dict,
get_element_by_attribute,
- parse_duration,
- strip_jsonp,
+ int_or_none,
)
class MiTeleIE(InfoExtractor):
- IE_NAME = 'mitele.es'
+ IE_DESC = 'mitele.es'
_VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
+ 'md5': '0ff1a13aebb35d9bc14081ff633dd324',
'info_dict': {
- 'id': '0fce117d',
- 'ext': 'mp4',
- 'title': 'Programa 144 - Tor, la web invisible',
- 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+ 'id': '0NF1jJnxS1Wu3pHrmvFyw2',
'display_id': 'programa-144',
+ 'ext': 'flv',
+ 'title': 'Tor, la web invisible',
+ 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
'duration': 2913,
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
- episode = self._match_id(url)
- webpage = self._download_webpage(url, episode)
- embed_data_json = self._search_regex(
- r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
- ).replace('\'', '"')
- embed_data = json.loads(embed_data_json)
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ config_url = self._search_regex(
+ r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
+ config_url = compat_urlparse.urljoin(url, config_url)
- domain = embed_data['mediaUrl']
- if not domain.startswith('http'):
- # only happens in telecinco.es videos
- domain = 'http://' + domain
- info_url = compat_urlparse.urljoin(
- domain,
- compat_urllib_parse_unquote(embed_data['flashvars']['host'])
- )
- info_el = self._download_xml(info_url, episode).find('./video/info')
+ config = self._download_json(
+ config_url, display_id, 'Downloading config JSON')
- video_link = info_el.find('videoUrl/link').text
- token_query = compat_urllib_parse.urlencode({'id': video_link})
- token_info = self._download_json(
- embed_data['flashvars']['ov_tk'] + '?' + token_query,
- episode,
- transform_source=strip_jsonp
- )
- formats = self._extract_m3u8_formats(
- token_info['tokenizedUrl'], episode, ext='mp4')
+ mmc = self._download_json(
+ config['services']['mmc'], display_id, 'Downloading mmc JSON')
+
+ formats = []
+ for location in mmc['locations']:
+ gat = self._proto_relative_url(location.get('gat'), 'http:')
+ bas = location.get('bas')
+ loc = location.get('loc')
+ ogn = location.get('ogn')
+ if None in (gat, bas, loc, ogn):
+ continue
+ token_data = {
+ 'bas': bas,
+ 'icd': loc,
+ 'ogn': ogn,
+ 'sta': '0',
+ }
+ media = self._download_json(
+ '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))),
+ display_id, 'Downloading %s JSON' % location['loc'])
+ file_ = media.get('file')
+ if not file_:
+ continue
+ formats.extend(self._extract_f4m_formats(
+ file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ display_id, f4m_id=loc))
+
+ title = self._search_regex(
+ r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title')
+
+ video_id = self._search_regex(
+ r'data-media-id\s*=\s*"([^"]+)"', webpage,
+ 'data media id', default=None) or display_id
+ thumbnail = config.get('poster', {}).get('imageUrl')
+ duration = int_or_none(mmc.get('duration'))
return {
- 'id': embed_data['videoId'],
- 'display_id': episode,
- 'title': info_el.find('title').text,
- 'formats': formats,
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
'description': get_element_by_attribute('class', 'text', webpage),
- 'thumbnail': info_el.find('thumb').text,
- 'duration': parse_duration(info_el.find('duration').text),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 69e4bcd1a..7c0c4e50e 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -17,7 +17,7 @@ from ..utils import (
class MonikerIE(InfoExtractor):
IE_DESC = 'allmyvideos.net and vidspot.net'
- _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)'
_TESTS = [{
'url': 'http://allmyvideos.net/jih3nce3x6wn',
@@ -46,6 +46,18 @@ class MonikerIE(InfoExtractor):
}, {
'url': 'https://www.vidspot.net/l2ngsmhs8ci5',
'only_matching': True,
+ }, {
+ 'url': 'http://vidspot.net/2/v-ywDf99',
+ 'md5': '5f8254ce12df30479428b0152fb8e7ba',
+ 'info_dict': {
+ 'id': 'ywDf99',
+ 'ext': 'mp4',
+ 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)',
+ 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.',
+ },
+ }, {
+ 'url': 'http://allmyvideos.net/v/v-HXZm5t',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -64,18 +76,30 @@ class MonikerIE(InfoExtractor):
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, error), expected=True)
- fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
- data = dict(fields)
+ builtin_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1',
+ orig_webpage, 'builtin URL', default=None, group='url')
- post = compat_urllib_parse.urlencode(data)
- headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- }
- req = compat_urllib_request.Request(url, post, headers)
- webpage = self._download_webpage(
- req, video_id, note='Downloading video page ...')
+ if builtin_url:
+ req = compat_urllib_request.Request(builtin_url)
+ req.add_header('Referer', url)
+ webpage = self._download_webpage(req, video_id, 'Downloading builtin page')
+ title = self._og_search_title(orig_webpage).strip()
+ description = self._og_search_description(orig_webpage).strip()
+ else:
+ fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
+ data = dict(fields)
+
+ post = compat_urllib_parse.urlencode(data)
+ headers = {
+ b'Content-Type': b'application/x-www-form-urlencoded',
+ }
+ req = compat_urllib_request.Request(url, post, headers)
+ webpage = self._download_webpage(
+ req, video_id, note='Downloading video page ...')
- title = os.path.splitext(data['fname'])[0]
+ title = os.path.splitext(data['fname'])[0]
+ description = None
# Could be several links with different quality
links = re.findall(r'"file" : "?(.+?)",', webpage)
@@ -89,5 +113,6 @@ class MonikerIE(InfoExtractor):
return {
'id': video_id,
'title': title,
+ 'description': description,
'formats': formats,
}
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
index 04e17d055..b8c43a163 100644
--- a/youtube_dl/extractor/movieclips.py
+++ b/youtube_dl/extractor/movieclips.py
@@ -1,80 +1,42 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import (
- compat_str,
-)
-from ..utils import (
- ExtractorError,
- clean_html,
+ compat_urllib_request,
)
class MovieClipsIE(InfoExtractor):
- _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?'
+ _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)'
_TEST = {
- 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/',
+ 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5',
'info_dict': {
- 'id': 'Wy7ZU',
- 'display_id': 'my-week-with-marilyn-movie-do-you-love-me',
+ 'id': 'pKIGmG83AqD9',
+ 'display_id': 'warcraft-trailer-1-561180739597',
'ext': 'mp4',
- 'title': 'My Week with Marilyn - Do You Love Me?',
- 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb',
+ 'title': 'Warcraft Trailer 1',
+ 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
+ 'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
- show_id = display_id or video_id
-
- config = self._download_xml(
- 'http://config.movieclips.com/player/config/%s' % video_id,
- show_id, 'Downloading player config')
-
- if config.find('./country-region').text == 'false':
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True)
-
- properties = config.find('./video/properties')
- smil_file = properties.attrib['smil_file']
+ display_id = self._match_id(url)
- smil = self._download_xml(smil_file, show_id, 'Downloading SMIL')
- base_url = smil.find('./head/meta').attrib['base']
-
- formats = []
- for video in smil.findall('./body/switch/video'):
- vbr = int(video.attrib['system-bitrate']) / 1000
- src = video.attrib['src']
- formats.append({
- 'url': base_url,
- 'play_path': src,
- 'ext': src.split(':')[0],
- 'vbr': vbr,
- 'format_id': '%dk' % vbr,
- })
-
- self._sort_formats(formats)
-
- title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title'])
- description = clean_html(compat_str(properties.attrib['clip_description']))
- thumbnail = properties.attrib['image']
- categories = properties.attrib['clip_categories'].split(',')
+ req = compat_urllib_request.Request(url)
+ # it doesn't work if it thinks the browser is too old
+ req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)')
+ webpage = self._download_webpage(req, display_id)
+ theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link')
+ title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com</title>', webpage, 'title')
+ description = self._html_search_meta('description', webpage)
return {
- 'id': video_id,
- 'display_id': display_id,
+ '_type': 'url_transparent',
+ 'url': theplatform_link,
'title': title,
+ 'display_id': display_id,
'description': description,
- 'thumbnail': thumbnail,
- 'categories': categories,
- 'formats': formats,
}
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index a597714e9..302c9bf35 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -200,7 +200,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
if mgid is None or ':' not in mgid:
mgid = self._search_regex(
[r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
- webpage, 'mgid')
+ webpage, 'mgid', default=None)
+
+ if not mgid:
+ sm4_embed = self._html_search_meta(
+ 'sm4:video:embed', webpage, 'sm4 embed', default='')
+ mgid = self._search_regex(
+ r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid')
videos_info = self._get_videos_info(mgid)
return videos_info
@@ -222,6 +228,13 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
},
}
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
def _get_feed_url(self, uri):
video_id = self._id_from_uri(uri)
site_id = uri.replace(video_id, '')
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 925967753..1f5fc2145 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -10,7 +10,6 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
- clean_html,
)
@@ -46,11 +45,11 @@ class NaverIE(InfoExtractor):
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
webpage)
if m_id is None:
- m_error = re.search(
- r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
- webpage)
- if m_error:
- raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
+ error = self._html_search_regex(
+ r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index 79a13958b..16213eed9 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -1,130 +1,387 @@
-# encoding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
+ determine_ext,
int_or_none,
+ parse_iso8601,
qualities,
- parse_duration,
)
class NDRBaseIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ display_id = next(group for group in mobj.groups() if group)
+ webpage = self._download_webpage(url, display_id)
+ return self._extract_embed(webpage, display_id)
- page = self._download_webpage(url, video_id, 'Downloading page')
- title = self._og_search_title(page).strip()
- description = self._og_search_description(page)
- if description:
- description = description.strip()
+class NDRIE(NDRBaseIE):
+ IE_NAME = 'ndr'
+ IE_DESC = 'NDR.de - Norddeutscher Rundfunk'
+ _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html'
+ _TESTS = [{
+ # httpVideo, same content id
+ 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
+ 'md5': '6515bc255dc5c5f8c85bbc38e035a659',
+ 'info_dict': {
+ 'id': 'hafengeburtstag988',
+ 'display_id': 'Party-Poette-und-Parade',
+ 'ext': 'mp4',
+ 'title': 'Party, Pötte und Parade',
+ 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c',
+ 'uploader': 'ndrtv',
+ 'timestamp': 1431108900,
+ 'upload_date': '20150510',
+ 'duration': 3498,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideo, different content id
+ 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html',
+ 'md5': '1043ff203eab307f0c51702ec49e9a71',
+ 'info_dict': {
+ 'id': 'osna272',
+ 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch',
+ 'ext': 'mp4',
+ 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights',
+ 'description': 'md5:32e9b800b3d2d4008103752682d5dc01',
+ 'uploader': 'ndrtv',
+ 'timestamp': 1442059200,
+ 'upload_date': '20150912',
+ 'duration': 510,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpAudio, same content id
+ 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html',
+ 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+ 'info_dict': {
+ 'id': 'audio51535',
+ 'display_id': 'La-Valette-entgeht-der-Hinrichtung',
+ 'ext': 'mp3',
+ 'title': 'La Valette entgeht der Hinrichtung',
+ 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
+ 'uploader': 'ndrinfo',
+ 'timestamp': 1290626100,
+ 'upload_date': '20140729',
+ 'duration': 884,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
+ 'only_matching': True,
+ }]
+
+ def _extract_embed(self, webpage, display_id):
+ embed_url = self._html_search_meta(
+ 'embedURL', webpage, 'embed URL', fatal=True)
+ description = self._search_regex(
+ r'<p[^>]+itemprop="description">([^<]+)</p>',
+ webpage, 'description', fatal=False)
+ timestamp = parse_iso8601(
+ self._search_regex(
+ r'<span itemprop="datePublished" content="([^"]+)">',
+ webpage, 'upload date', fatal=False))
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'display_id': display_id,
+ 'description': description,
+ 'timestamp': timestamp,
+ }
- duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None))
- if not duration:
- duration = parse_duration(self._html_search_regex(
- r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)',
- page, 'duration', default=None))
- formats = []
+class NJoyIE(NDRBaseIE):
+ IE_NAME = 'njoy'
+ IE_DESC = 'N-JOY'
+ _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html'
+ _TESTS = [{
+ # httpVideo, same content id
+ 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
+ 'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+ 'info_dict': {
+ 'id': 'comedycontest2480',
+ 'display_id': 'Benaissa-beim-NDR-Comedy-Contest',
+ 'ext': 'mp4',
+ 'title': 'Benaissa beim NDR Comedy Contest',
+ 'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39',
+ 'uploader': 'ndrtv',
+ 'upload_date': '20141129',
+ 'duration': 654,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideo, different content id
+ 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html',
+ 'md5': '417660fffa90e6df2fda19f1b40a64d8',
+ 'info_dict': {
+ 'id': 'dockville882',
+ 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-',
+ 'ext': 'mp4',
+ 'title': '"Ich hab noch nie" mit Felix Jaehn',
+ 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3',
+ 'uploader': 'njoy',
+ 'upload_date': '20150822',
+ 'duration': 211,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html',
+ 'only_matching': True,
+ }]
+
+ def _extract_embed(self, webpage, display_id):
+ video_id = self._search_regex(
+ r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id')
+ description = self._search_regex(
+ r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>',
+ webpage, 'description', fatal=False)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'NDREmbedBase',
+ 'url': 'ndr:%s' % video_id,
+ 'display_id': display_id,
+ 'description': description,
+ }
+
+
+class NDREmbedBaseIE(InfoExtractor):
+ IE_NAME = 'ndr:embed:base'
+ _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)'
+ _TESTS = [{
+ 'url': 'ndr:soundcheck3366',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('id_s')
+
+ ppjson = self._download_json(
+ 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id)
- mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
- if mp3_url:
- formats.append({
- 'url': mp3_url.group('audio'),
- 'format_id': 'mp3',
- })
+ playlist = ppjson['playlist']
- thumbnail = None
+ formats = []
+ quality_key = qualities(('xs', 's', 'm', 'l', 'xl'))
- video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page)
- if video_url:
- thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
- if thumbnails:
- quality_key = qualities(['xs', 's', 'm', 'l', 'xl'])
- largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1]))
- thumbnail = 'http://www.ndr.de' + largest[0]
+ for format_id, f in playlist.items():
+ src = f.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, None)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, m3u8_id='hls', entry_protocol='m3u8_native'))
+ else:
+ quality = f.get('quality')
+ ff = {
+ 'url': src,
+ 'format_id': quality or format_id,
+ 'quality': quality_key(quality),
+ }
+ type_ = f.get('type')
+ if type_ and type_.split('/')[0] == 'audio':
+ ff['vcodec'] = 'none'
+ ff['ext'] = ext or 'mp3'
+ formats.append(ff)
+ self._sort_formats(formats)
- for format_id in 'lo', 'hi', 'hq':
- formats.append({
- 'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
- 'format_id': format_id,
- })
+ config = playlist['config']
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
+ live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive']
+ title = config['title']
+ if live:
+ title = self._live_title(title)
+ uploader = ppjson.get('config', {}).get('branding')
+ upload_date = ppjson.get('config', {}).get('publicationDate')
+ duration = int_or_none(config.get('duration'))
+
+ thumbnails = [{
+ 'id': thumbnail.get('quality') or thumbnail_id,
+ 'url': thumbnail['src'],
+ 'preference': quality_key(thumbnail.get('quality')),
+ } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')]
return {
'id': video_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'is_live': live,
+ 'uploader': uploader if uploader != '-' else None,
+ 'upload_date': upload_date[0:8] if upload_date else None,
'duration': duration,
+ 'thumbnails': thumbnails,
'formats': formats,
}
-class NDRIE(NDRBaseIE):
- IE_NAME = 'ndr'
- IE_DESC = 'NDR.de - Mediathek'
- _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
-
- _TESTS = [
- {
- 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
- 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
- 'note': 'Video file',
- 'info_dict': {
- 'id': '25866',
- 'ext': 'mp4',
- 'title': 'Kartoffeltage in der Lewitz',
- 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
- 'duration': 166,
- },
- 'skip': '404 Not found',
- },
- {
- 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
- 'md5': 'dadc003c55ae12a5d2f6bd436cd73f59',
- 'info_dict': {
- 'id': '988',
- 'ext': 'mp4',
- 'title': 'Party, Pötte und Parade',
- 'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.',
- 'duration': 3498,
- },
- },
- {
- 'url': 'http://www.ndr.de/info/audio51535.html',
- 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
- 'note': 'Audio file',
- 'info_dict': {
- 'id': '51535',
- 'ext': 'mp3',
- 'title': 'La Valette entgeht der Hinrichtung',
- 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
- 'duration': 884,
- }
- }
- ]
-
+class NDREmbedIE(NDREmbedBaseIE):
+ IE_NAME = 'ndr:embed'
+ _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html'
+ _TESTS = [{
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html',
+ 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9',
+ 'info_dict': {
+ 'id': 'ndraktuell28488',
+ 'ext': 'mp4',
+ 'title': 'Norddeutschland begrüßt Flüchtlinge',
+ 'is_live': False,
+ 'uploader': 'ndrtv',
+ 'upload_date': '20150907',
+ 'duration': 132,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html',
+ 'md5': '002085c44bae38802d94ae5802a36e78',
+ 'info_dict': {
+ 'id': 'soundcheck3366',
+ 'ext': 'mp4',
+ 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen',
+ 'is_live': False,
+ 'uploader': 'ndr2',
+ 'upload_date': '20150912',
+ 'duration': 3554,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/info/audio51535-player.html',
+ 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+ 'info_dict': {
+ 'id': 'audio51535',
+ 'ext': 'mp3',
+ 'title': 'La Valette entgeht der Hinrichtung',
+ 'is_live': False,
+ 'uploader': 'ndrinfo',
+ 'upload_date': '20140729',
+ 'duration': 884,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html',
+ 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c',
+ 'info_dict': {
+ 'id': 'visite11010',
+ 'ext': 'mp4',
+ 'title': 'Visite - die ganze Sendung',
+ 'is_live': False,
+ 'uploader': 'ndrtv',
+ 'upload_date': '20150902',
+ 'duration': 3525,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideoLive
+ 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html',
+ 'info_dict': {
+ 'id': 'livestream217',
+ 'ext': 'flv',
+ 'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ 'upload_date': '20150910',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/doku952-player.html',
+ 'only_matching': True,
+ }]
-class NJoyIE(NDRBaseIE):
- IE_NAME = 'N-JOY'
- _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html'
- _TEST = {
- 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
- 'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+class NJoyEmbedIE(NDREmbedBaseIE):
+ IE_NAME = 'njoy:embed'
+ _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html'
+ _TESTS = [{
+ # httpVideo
+ 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html',
+ 'md5': '8483cbfe2320bd4d28a349d62d88bd74',
'info_dict': {
- 'id': '2480',
+ 'id': 'doku948',
'ext': 'mp4',
- 'title': 'Benaissa beim NDR Comedy Contest',
- 'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.',
- 'duration': 654,
- }
- }
+ 'title': 'Zehn Jahre Reeperbahn Festival - die Doku',
+ 'is_live': False,
+ 'upload_date': '20150807',
+ 'duration': 1011,
+ },
+ }, {
+ # httpAudio
+ 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html',
+ 'md5': 'd989f80f28ac954430f7b8a48197188a',
+ 'info_dict': {
+ 'id': 'stefanrichter100',
+ 'ext': 'mp3',
+ 'title': 'Interview mit einem Augenzeugen',
+ 'is_live': False,
+ 'uploader': 'njoy',
+ 'upload_date': '20150909',
+ 'duration': 140,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpAudioLive, no explicit ext
+ 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html',
+ 'info_dict': {
+ 'id': 'webradioweltweit100',
+ 'ext': 'mp3',
+ 'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ 'uploader': 'njoy',
+ 'upload_date': '20150810',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index c10784f6b..d1688457f 100644
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py
@@ -126,7 +126,8 @@ class AppleDailyIE(NextMediaIE):
'thumbnail': 're:^https?://.*\.jpg$',
'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
'upload_date': '20150128',
- }
+ },
+ 'skip': 'redirect to http://www.appledaily.com.tw/animation/',
}, {
# No thumbnail
'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
@@ -140,10 +141,19 @@ class AppleDailyIE(NextMediaIE):
},
'expected_warnings': [
'video thumbnail',
- ]
+ ],
+ 'skip': 'redirect to http://www.appledaily.com.tw/animation/',
}, {
'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
- 'only_matching': True,
+ 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d',
+ 'info_dict': {
+ 'id': '35770334',
+ 'ext': 'mp4',
+ 'title': '咖啡占卜測 XU裝熟指數',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748',
+ 'upload_date': '20140417',
+ },
}]
_URL_PATTERN = r'\{url: \'(.+)\'\}'
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index dc54634a5..200874d68 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -16,53 +16,118 @@ from ..utils import (
class NFLIE(InfoExtractor):
IE_NAME = 'nfl.com'
- _VALID_URL = r'''(?x)https?://
- (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
- (?:.+?/)*
- (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
- _TESTS = [
- {
- 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
- 'md5': '394ef771ddcd1354f665b471d78ec4c6',
- 'info_dict': {
- 'id': '0ap3000000398478',
- 'ext': 'mp4',
- 'title': 'Week 3: Redskins vs. Eagles highlights',
- 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
- 'upload_date': '20140921',
- 'timestamp': 1411337580,
- 'thumbnail': 're:^https?://.*\.jpg$',
- }
+ _VALID_URL = r'''(?x)
+ https?://
+ (?P<host>
+ (?:www\.)?
+ (?:
+ (?:
+ nfl|
+ buffalobills|
+ miamidolphins|
+ patriots|
+ newyorkjets|
+ baltimoreravens|
+ bengals|
+ clevelandbrowns|
+ steelers|
+ houstontexans|
+ colts|
+ jaguars|
+ titansonline|
+ denverbroncos|
+ kcchiefs|
+ raiders|
+ chargers|
+ dallascowboys|
+ giants|
+ philadelphiaeagles|
+ redskins|
+ chicagobears|
+ detroitlions|
+ packers|
+ vikings|
+ atlantafalcons|
+ panthers|
+ neworleanssaints|
+ buccaneers|
+ azcardinals|
+ stlouisrams|
+ 49ers|
+ seahawks
+ )\.com|
+ .+?\.clubs\.nfl\.com
+ )
+ )/
+ (?:.+?/)*
+ (?P<id>[^/#?&]+)
+ '''
+ _TESTS = [{
+ 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+ 'md5': '394ef771ddcd1354f665b471d78ec4c6',
+ 'info_dict': {
+ 'id': '0ap3000000398478',
+ 'ext': 'mp4',
+ 'title': 'Week 3: Redskins vs. Eagles highlights',
+ 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+ 'upload_date': '20140921',
+ 'timestamp': 1411337580,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+ 'info_dict': {
+ 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'ext': 'mp4',
+ 'title': 'LIVE: Post Game vs. Browns',
+ 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+ 'upload_date': '20131229',
+ 'timestamp': 1388354455,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
+ 'info_dict': {
+ 'id': '0ap3000000467607',
+ 'ext': 'mp4',
+ 'title': 'Frustrations flare on the field',
+ 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
+ 'timestamp': 1422850320,
+ 'upload_date': '20150202',
},
- {
- 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
- 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
- 'info_dict': {
- 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
- 'ext': 'mp4',
- 'title': 'LIVE: Post Game vs. Browns',
- 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
- 'upload_date': '20131229',
- 'timestamp': 1388354455,
- 'thumbnail': 're:^https?://.*\.jpg$',
- }
+ }, {
+ 'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
+ 'md5': '4c319e2f625ffd0b481b4382c6fc124c',
+ 'info_dict': {
+ 'id': 'n-238346',
+ 'ext': 'mp4',
+ 'title': '10 Days at Gillette',
+ 'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
+ 'timestamp': 1442618809,
+ 'upload_date': '20150918',
},
- {
- 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
- 'info_dict': {
- 'id': '0ap3000000467607',
- 'ext': 'mp4',
- 'title': 'Frustrations flare on the field',
- 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
- 'timestamp': 1422850320,
- 'upload_date': '20150202',
- },
+ }, {
+ # lowercase data-contentid
+ 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
+ 'info_dict': {
+ 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
+ 'ext': 'mp4',
+ 'title': 'Tomlin looks ahead to Ravens on a short week',
+ 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
+ 'timestamp': 1443459651,
+ 'upload_date': '20150928',
},
- {
- 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
- 'only_matching': True,
- }
- ]
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
+ 'only_matching': True,
+ }]
@staticmethod
def prepend_host(host, url):
@@ -95,13 +160,14 @@ class NFLIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config_url = NFLIE.prepend_host(host, self._search_regex(
- r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL',
- default='static/content/static/config/video/config.json'))
+ r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
+ webpage, 'config URL', default='static/content/static/config/video/config.json',
+ group='config'))
# For articles, the id in the url is not the video id
video_id = self._search_regex(
- r'contentId\s*:\s*"([^"]+)"', webpage, 'video id', default=video_id)
- config = self._download_json(config_url, video_id,
- note='Downloading player config')
+ r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>.+?)\1',
+ webpage, 'video id', default=video_id, group='id')
+ config = self._download_json(config_url, video_id, 'Downloading player config')
url_template = NFLIE.prepend_host(
host, '{contentURLTemplate:}'.format(**config))
video_data = self._download_json(
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 279b18386..e98a5ef89 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -72,7 +72,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)(?:id|hlg)=(?P<id>[-0-9a-zA-Z,]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
@@ -136,6 +136,9 @@ class NHLIE(NHLBaseInfoExtractor):
'params': {
'skip_download': True, # Requires rtmpdump
}
+ }, {
+ 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -146,9 +149,9 @@ class NHLIE(NHLBaseInfoExtractor):
class NHLNewsIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:news'
IE_DESC = 'NHL news'
- _VALID_URL = r'https?://(?:www\.)?nhl\.com/ice/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.nhl.com/ice/news.htm?id=750727',
'md5': '4b3d1262e177687a3009937bd9ec0be8',
'info_dict': {
@@ -159,13 +162,26 @@ class NHLNewsIE(NHLBaseInfoExtractor):
'duration': 37,
'upload_date': '20150128',
},
- }
+ }, {
+ # iframe embed
+ 'url': 'http://sabres.nhl.com/club/news.htm?id=780189',
+ 'md5': '9f663d1c006c90ac9fb82777d4294e12',
+ 'info_dict': {
+ 'id': '836127',
+ 'ext': 'mp4',
+ 'title': 'Morning Skate: OTT vs. BUF (9/23/15)',
+ 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.",
+ 'duration': 93,
+ 'upload_date': '20150923',
+ },
+ }]
def _real_extract(self, url):
news_id = self._match_id(url)
webpage = self._download_webpage(url, news_id)
video_id = self._search_regex(
- [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'"],
+ [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'",
+ r'<iframe[^>]+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'],
webpage, 'video id')
return self._real_extract_video(video_id)
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 7f842b5c2..a06d38afd 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import str_to_int
@@ -9,61 +8,93 @@ from ..utils import str_to_int
class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
- _VALID_URL = r'''(?x)^https?://(?:www\.)?9gag\.tv/
- (?:
- v/(?P<numid>[0-9]+)|
- p/(?P<id>[a-zA-Z0-9]+)/(?P<display_id>[^?#/]+)
- )
- '''
+ _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'
_TESTS = [{
- "url": "http://9gag.tv/v/1912",
- "info_dict": {
- "id": "1912",
- "ext": "mp4",
- "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
- "title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+ 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome',
+ 'info_dict': {
+ 'id': 'Kk2X5',
+ 'ext': 'mp4',
+ 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)',
+ 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome',
'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
'uploader': 'CompilationChannel',
'upload_date': '20131110',
- "view_count": int,
- "thumbnail": "re:^https?://",
+ 'view_count': int,
},
- 'add_ie': ['Youtube']
+ 'add_ie': ['Youtube'],
}, {
- 'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar',
+ 'url': 'http://9gag.com/tv/p/aKolP3',
'info_dict': {
- 'id': 'KklwM',
+ 'id': 'aKolP3',
'ext': 'mp4',
- 'display_id': 'alternate-banned-opening-scene-of-gravity',
- "description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
- 'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
- 'uploader': 'Krishna Shenoi',
- 'upload_date': '20140401',
- 'uploader_id': 'krishnashenoi93',
+ 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video',
+ 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!",
+ 'uploader_id': 'rickmereki',
+ 'uploader': 'Rick Mereki',
+ 'upload_date': '20110803',
+ 'view_count': int,
},
+ 'add_ie': ['Vimeo'],
+ }, {
+ 'url': 'http://9gag.com/tv/p/KklwM',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://9gag.tv/p/Kk2X5',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://9gag.com/tv/embed/a5Dmvl',
+ 'only_matching': True,
}]
+ _EXTERNAL_VIDEO_PROVIDER = {
+ '1': {
+ 'url': '%s',
+ 'ie_key': 'Youtube',
+ },
+ '2': {
+ 'url': 'http://player.vimeo.com/video/%s',
+ 'ie_key': 'Vimeo',
+ },
+ '3': {
+ 'url': 'http://instagram.com/p/%s',
+ 'ie_key': 'Instagram',
+ },
+ '4': {
+ 'url': 'http://vine.co/v/%s',
+ 'ie_key': 'Vine',
+ },
+ }
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('numid') or mobj.group('id')
+ video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
webpage = self._download_webpage(url, display_id)
- post_view = json.loads(self._html_search_regex(
- r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'))
+ post_view = self._parse_json(
+ self._search_regex(
+ r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',
+ webpage, 'post view'),
+ display_id)
- youtube_id = post_view['videoExternalId']
+ ie_key = None
+ source_url = post_view.get('sourceUrl')
+ if not source_url:
+ external_video_id = post_view['videoExternalId']
+ external_video_provider = post_view['videoExternalProvider']
+ source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id
+ ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']
title = post_view['title']
- description = post_view['description']
- view_count = str_to_int(post_view['externalView'])
+ description = post_view.get('description')
+ view_count = str_to_int(post_view.get('externalView'))
thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
return {
'_type': 'url_transparent',
- 'url': youtube_id,
- 'ie_key': 'Youtube',
+ 'url': source_url,
+ 'ie_key': ie_key,
'id': video_id,
'display_id': display_id,
'title': title,
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index 04d779890..6b15fc2e5 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -4,10 +4,14 @@ import re
from .common import InfoExtractor
from ..compat import (
+ compat_urllib_request,
compat_urlparse,
)
from ..utils import (
ExtractorError,
+ NO_DEFAULT,
+ encode_dict,
+ urlencode_postdata,
)
@@ -38,19 +42,40 @@ class NovaMovIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- page = self._download_webpage(
- 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
+ url = 'http://%s/video/%s' % (self._HOST, video_id)
- if re.search(self._FILE_DELETED_REGEX, page) is not None:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ webpage = self._download_webpage(
+ url, video_id, 'Downloading video page')
- filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
+ if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
- description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
+ def extract_filekey(default=NO_DEFAULT):
+ return self._search_regex(
+ self._FILEKEY_REGEX, webpage, 'filekey', default=default)
+
+ filekey = extract_filekey(default=None)
+
+ if not filekey:
+ fields = self._hidden_inputs(webpage)
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', webpage,
+ 'post url', default=url, group='url')
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(url, post_url)
+ request = compat_urllib_request.Request(
+ post_url, urlencode_postdata(encode_dict(fields)))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ request.add_header('Referer', post_url)
+ webpage = self._download_webpage(
+ request, video_id, 'Downloading continue to the video page')
+
+ filekey = extract_filekey()
+
+ title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title', fatal=False)
+ description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False)
api_response = self._download_webpage(
'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index 6b2f3f55a..b97f62fdb 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -1,64 +1,134 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .brightcove import BrightcoveIE
from .common import InfoExtractor
from ..utils import ExtractorError
+from ..compat import (
+ compat_str,
+ compat_urllib_request,
+)
+
+
+class NownessBaseIE(InfoExtractor):
+ def _extract_url_result(self, post):
+ if post['type'] == 'video':
+ for media in post['media']:
+ if media['type'] == 'video':
+ video_id = media['content']
+ source = media['source']
+ if source == 'brightcove':
+ player_code = self._download_webpage(
+ 'http://www.nowness.com/iframe?id=%s' % video_id, video_id,
+ note='Downloading player JavaScript',
+ errnote='Unable to download player JavaScript')
+ bc_url = BrightcoveIE._extract_brightcove_url(player_code)
+ if bc_url is None:
+ raise ExtractorError('Could not find player definition')
+ return self.url_result(bc_url, 'Brightcove')
+ elif source == 'vimeo':
+ return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
+ elif source == 'youtube':
+ return self.url_result(video_id, 'Youtube')
+ elif source == 'cinematique':
+ # youtube-dl currently doesn't support cinematique
+ # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique')
+ pass
+ def _api_request(self, url, request_path):
+ display_id = self._match_id(url)
+ request = compat_urllib_request.Request(
+ 'http://api.nowness.com/api/' + request_path % display_id,
+ headers={
+ 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us',
+ })
+ return display_id, self._download_json(request, display_id)
-class NownessIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])'
- _TESTS = [
- {
- 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation',
- 'md5': '068bc0202558c2e391924cb8cc470676',
- 'info_dict': {
- 'id': '2520295746001',
- 'ext': 'mp4',
- 'title': 'Candor: The Art of Gesticulation',
- 'description': 'Candor: The Art of Gesticulation',
- 'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
- }
+class NownessIE(NownessBaseIE):
+ IE_NAME = 'nowness'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])'
+ _TESTS = [{
+ 'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation',
+ 'md5': '068bc0202558c2e391924cb8cc470676',
+ 'info_dict': {
+ 'id': '2520295746001',
+ 'ext': 'mp4',
+ 'title': 'Candor: The Art of Gesticulation',
+ 'description': 'Candor: The Art of Gesticulation',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Nowness',
},
- {
- 'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr',
- 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
- 'info_dict': {
- 'id': '3716354522001',
- 'ext': 'mp4',
- 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
- 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
- 'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
- }
+ }, {
+ 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr',
+ 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
+ 'info_dict': {
+ 'id': '3716354522001',
+ 'ext': 'mp4',
+ 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+ 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Nowness',
},
- ]
+ }, {
+ # vimeo
+ 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut',
+ 'md5': '9a5a6a8edf806407e411296ab6bc2a49',
+ 'info_dict': {
+ 'id': '130020913',
+ 'ext': 'mp4',
+ 'title': 'Bleu, Blanc, Rouge - A Godard Supercut',
+ 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150607',
+ 'uploader': 'Cinema Sem Lei',
+ 'uploader_id': 'cinemasemlei',
+ },
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('slug')
+ _, post = self._api_request(url, 'post/getBySlug/%s')
+ return self._extract_url_result(post)
- webpage = self._download_webpage(url, video_id)
- player_url = self._search_regex(
- r'"([^"]+/content/issue-[0-9.]+.js)"', webpage, 'player URL')
- real_id = self._search_regex(
- r'\sdata-videoId="([0-9]+)"', webpage, 'internal video ID')
- player_code = self._download_webpage(
- player_url, video_id,
- note='Downloading player JavaScript',
- errnote='Player download failed')
- player_code = player_code.replace("'+d+'", real_id)
+class NownessPlaylistIE(NownessBaseIE):
+ IE_NAME = 'nowness:playlist'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues',
+ 'info_dict': {
+ 'id': '3286',
+ },
+ 'playlist_mincount': 8,
+ }
- bc_url = BrightcoveIE._extract_brightcove_url(player_code)
- if bc_url is None:
- raise ExtractorError('Could not find player definition')
- return {
- '_type': 'url',
- 'url': bc_url,
- 'ie_key': 'Brightcove',
- }
+ def _real_extract(self, url):
+ playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s')
+ entries = [self._extract_url_result(item) for item in playlist['items']]
+ return self.playlist_result(entries, playlist_id)
+
+
+class NownessSeriesIE(NownessBaseIE):
+ IE_NAME = 'nowness:series'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])'
+ _TEST = {
+ 'url': 'https://www.nowness.com/series/60-seconds',
+ 'info_dict': {
+ 'id': '60',
+ 'title': '60 Seconds',
+ 'description': 'One-minute wisdom in a new NOWNESS series',
+ },
+ 'playlist_mincount': 4,
+ }
+
+ def _real_extract(self, url):
+ display_id, series = self._api_request(url, 'series/getBySlug/%s')
+ entries = [self._extract_url_result(post) for post in series['posts']]
+ series_title = None
+ series_description = None
+ translations = series.get('translations', [])
+ if translations:
+ series_title = translations[0].get('title') or translations[0]['seoTitle']
+ series_description = translations[0].get('seoDescription')
+ return self.playlist_result(
+ entries, compat_str(series['id']), series_title, series_description)
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
index c8257719f..b0bdffc4e 100644
--- a/youtube_dl/extractor/nowtv.py
+++ b/youtube_dl/extractor/nowtv.py
@@ -167,8 +167,8 @@ class NowTVIE(InfoExtractor):
'app': app,
'play_path': 'mp4:%s' % play_path,
'ext': 'flv',
- 'page_url': url,
- 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf',
+ 'page_url': 'http://rtlnow.rtl.de',
+ 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf',
'tbr': int_or_none(item.get('bitrate')),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
index 17baa9679..57ee3d366 100644
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -7,9 +7,9 @@ class NowVideoIE(NovaMovIE):
IE_NAME = 'nowvideo'
IE_DESC = 'NowVideo'
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|ec|sx|eu|at|ag|co|li)'}
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'}
- _HOST = 'www.nowvideo.ch'
+ _HOST = 'www.nowvideo.to'
_FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
_FILEKEY_REGEX = r'var fkzd="([^"]+)";'
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index d066a96db..8ac38a174 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
float_or_none,
@@ -49,7 +50,7 @@ class NRKIE(InfoExtractor):
if data['usageRights']['isGeoBlocked']:
raise ExtractorError(
- 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge',
+ 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
expected=True)
video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
@@ -196,20 +197,6 @@ class NRKTVIE(InfoExtractor):
}
]
- def _debug_print(self, txt):
- if self._downloader.params.get('verbose', False):
- self.to_screen('[debug] %s' % txt)
-
- def _get_subtitles(self, subtitlesurl, video_id, baseurl):
- url = "%s%s" % (baseurl, subtitlesurl)
- self._debug_print('%s: Subtitle url: %s' % (video_id, url))
- captions = self._download_xml(
- url, video_id, 'Downloading subtitles')
- lang = captions.get('lang', 'no')
- return {lang: [
- {'ext': 'ttml', 'url': url},
- ]}
-
def _extract_f4m(self, manifest_url, video_id):
return self._extract_f4m_formats(
manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
@@ -218,7 +205,7 @@ class NRKTVIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
part_id = mobj.group('part_id')
- baseurl = mobj.group('baseurl')
+ base_url = mobj.group('baseurl')
webpage = self._download_webpage(url, video_id)
@@ -278,11 +265,14 @@ class NRKTVIE(InfoExtractor):
self._sort_formats(formats)
subtitles_url = self._html_search_regex(
- r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"',
- webpage, 'subtitle URL', default=None)
- subtitles = None
+ r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
+ webpage, 'subtitle URL', default=None, group='url')
+ subtitles = {}
if subtitles_url:
- subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl)
+ subtitles['no'] = [{
+ 'ext': 'ttml',
+ 'url': compat_urlparse.urljoin(base_url, subtitles_url),
+ }]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 66520c2c5..184c7a323 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
+ ExtractorError,
unified_strdate,
int_or_none,
qualities,
@@ -12,7 +13,7 @@ from ..utils import (
class OdnoklassnikiIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
_TESTS = [{
# metadata in JSON
'url': 'http://ok.ru/video/20079905452',
@@ -28,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor):
'like_count': int,
'age_limit': 0,
},
+ 'skip': 'Video has been blocked',
}, {
# metadataUrl
'url': 'http://ok.ru/video/63567059965189-0',
@@ -64,6 +66,9 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
'url': 'http://www.ok.ru/video/20648036891',
'only_matching': True,
+ }, {
+ 'url': 'http://www.ok.ru/videoembed/20648036891',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -72,6 +77,12 @@ class OdnoklassnikiIE(InfoExtractor):
webpage = self._download_webpage(
'http://ok.ru/video/%s' % video_id, video_id)
+ error = self._search_regex(
+ r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+
player = self._parse_json(
unescapeHTML(self._search_regex(
r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py
deleted file mode 100644
index d2ceedd01..000000000
--- a/youtube_dl/extractor/openfilm.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote_plus
-from ..utils import (
- parse_iso8601,
- parse_age_limit,
- int_or_none,
-)
-
-
-class OpenFilmIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)'
- _TEST = {
- 'url': 'http://www.openfilm.com/videos/human-resources-remastered',
- 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37',
- 'info_dict': {
- 'id': '32736',
- 'display_id': 'human-resources-remastered',
- 'ext': 'mp4',
- 'title': 'Human Resources (Remastered)',
- 'description': 'Social Engineering in the 20th Century.',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 7164,
- 'timestamp': 1334756988,
- 'upload_date': '20120418',
- 'uploader_id': '41117',
- 'view_count': int,
- 'age_limit': 0,
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- player = compat_urllib_parse_unquote_plus(
- self._og_search_video_url(webpage))
-
- video = json.loads(self._search_regex(
- r'\bp=({.+?})(?:&|$)', player, 'video JSON'))
-
- video_url = '%s1.mp4' % video['location']
- video_id = video.get('video_id')
- display_id = video.get('alias') or display_id
- title = video.get('title')
- description = video.get('description')
- thumbnail = video.get('main_thumb')
- duration = int_or_none(video.get('duration'))
- timestamp = parse_iso8601(video.get('dt_published'), ' ')
- uploader_id = video.get('user_id')
- view_count = int_or_none(video.get('views_count'))
- age_limit = parse_age_limit(video.get('age_limit'))
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'url': video_url,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'timestamp': timestamp,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
- 'age_limit': age_limit,
- }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 683c81de3..8fb9b1849 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -8,6 +8,7 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ strip_jsonp,
unified_strdate,
US_RATINGS,
)
@@ -108,12 +109,12 @@ class PBSIE(InfoExtractor):
{
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
- 'id': '2280706814',
+ 'id': '2276541483',
'display_id': 'player',
'ext': 'mp4',
- 'title': 'American Experience - Death and the Civil War',
+ 'title': 'American Experience - Death and the Civil War, Chapter 1',
'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
- 'duration': 6705,
+ 'duration': 682,
'thumbnail': 're:^https?://.*\.jpg$',
},
'params': {
@@ -134,8 +135,49 @@ class PBSIE(InfoExtractor):
'params': {
'skip_download': True, # requires ffmpeg
},
+ 'skip': 'Expired',
+ },
+ {
+ # Video embedded in iframe containing angle brackets as attribute's value (e.g.
+ # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
+ # https://github.com/rg3/youtube-dl/issues/7059)
+ 'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+ 'info_dict': {
+ 'id': '2365546844',
+ 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
+ 'ext': 'mp4',
+ 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
+ 'description': 'md5:61db2ddf27c9912f09c241014b118ed1',
+ 'duration': 1480,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
+ # Frontline video embedded via flp2012.js
+ 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
+ 'info_dict': {
+ 'id': '2070868960',
+ 'display_id': 'the-atomic-artists',
+ 'ext': 'mp4',
+ 'title': 'FRONTLINE - The Atomic Artists',
+ 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+ 'duration': 723,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}
]
+ _ERRORS = {
+ 101: 'We\'re sorry, but this video is not yet available.',
+ 403: 'We\'re sorry, but this video is not available in your region due to right restrictions.',
+ 404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
+ 410: 'This video has expired and is no longer available for online streaming.',
+ }
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -166,9 +208,30 @@ class PBSIE(InfoExtractor):
if media_id:
return media_id, presumptive_id, upload_date
- url = self._search_regex(
- r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
- webpage, 'player URL')
+ # Frontline video embedded via flp
+ video_id = self._search_regex(
+ r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
+ if video_id:
+ # prg_id calculation is reverse engineered from
+ # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js
+ prg_id = self._search_regex(
+ r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:]
+ if 'q' in prg_id:
+ prg_id = prg_id.split('q')[1]
+ prg_id = int(prg_id, 16)
+ getdir = self._download_json(
+ 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
+ presumptive_id, 'Downloading getdir JSON',
+ transform_source=strip_jsonp)
+ return getdir['mid'], presumptive_id, upload_date
+
+ for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
+ url = self._search_regex(
+ r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe,
+ 'player URL', default=None, group='url')
+ if url:
+ break
+
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
@@ -213,13 +276,11 @@ class PBSIE(InfoExtractor):
'Downloading %s video url info' % encoding_name)
if redirect_info['status'] == 'error':
- if redirect_info['http_code'] == 403:
- message = (
- 'The video is not available in your region due to '
- 'right restrictions')
- else:
- message = redirect_info['message']
- raise ExtractorError(message, expected=True)
+ raise ExtractorError(
+ '%s said: %s' % (
+ self.IE_NAME,
+ self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])),
+ expected=True)
format_url = redirect_info.get('url')
if not format_url:
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 8ad936758..887c8020d 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -12,7 +12,8 @@ from ..utils import parse_iso8601
class PeriscopeIE(InfoExtractor):
IE_DESC = 'Periscope'
_VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)'
- _TEST = {
+ # Live example URLs can be found at http://onperiscope.com/
+ _TESTS = [{
'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
'md5': '65b57957972e503fcbbaeed8f4fa04ca',
'info_dict': {
@@ -25,11 +26,15 @@ class PeriscopeIE(InfoExtractor):
'uploader_id': '1465763',
},
'skip': 'Expires in 24 hours',
- }
+ }, {
+ 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+ 'only_matching': True,
+ }]
- def _call_api(self, method, token):
+ def _call_api(self, method, value):
+ attribute = 'token' if len(value) > 13 else 'broadcast_id'
return self._download_json(
- 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token)
+ 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value)
def _real_extract(self, url):
token = self._match_id(url)
diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py
index bdc71017b..6d138ef25 100644
--- a/youtube_dl/extractor/playwire.py
+++ b/youtube_dl/extractor/playwire.py
@@ -19,7 +19,7 @@ class PlaywireIE(InfoExtractor):
'id': '3353705',
'ext': 'mp4',
'title': 'S04_RM_UCL_Rus',
- 'thumbnail': 're:^http://.*\.png$',
+ 'thumbnail': 're:^https?://.*\.png$',
'duration': 145.94,
},
}, {
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 7b0cdc41a..a656ad85a 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -20,7 +20,7 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+ _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '882f488fa1f0026f023f33576004a2ed',
@@ -34,6 +34,9 @@ class PornHubIE(InfoExtractor):
}, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True,
+ }, {
+ 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index effcf1db3..baa54a3af 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -20,7 +20,7 @@ from ..utils import (
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 1654a641f..c98539f6a 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -25,7 +25,7 @@ class QQMusicIE(InfoExtractor):
'id': '004295Et37taLD',
'ext': 'mp3',
'title': '可惜没如果',
- 'upload_date': '20141227',
+ 'release_date': '20141227',
'creator': '林俊杰',
'description': 'md5:d327722d0361576fde558f1ac68a7065',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -38,11 +38,26 @@ class QQMusicIE(InfoExtractor):
'id': '004MsGEo3DdNxV',
'ext': 'mp3',
'title': '如果',
- 'upload_date': '20050626',
+ 'release_date': '20050626',
'creator': '李季美',
'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
'thumbnail': 're:^https?://.*\.jpg$',
}
+ }, {
+ 'note': 'lyrics not in .lrc format',
+ 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+ 'info_dict': {
+ 'id': '001JyApY11tIp6',
+ 'ext': 'mp3',
+ 'title': 'Shadows Over Transylvania',
+ 'release_date': '19970225',
+ 'creator': 'Dark Funeral',
+ 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
_FORMATS = {
@@ -112,15 +127,27 @@ class QQMusicIE(InfoExtractor):
self._check_formats(formats, mid)
self._sort_formats(formats)
- return {
+ actual_lrc_lyrics = ''.join(
+ line + '\n' for line in re.findall(
+ r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content))
+
+ info_dict = {
'id': mid,
'formats': formats,
'title': song_name,
- 'upload_date': publish_time,
+ 'release_date': publish_time,
'creator': singer,
'description': lrc_content,
- 'thumbnail': thumbnail_url,
+ 'thumbnail': thumbnail_url
}
+ if actual_lrc_lyrics:
+ info_dict['subtitles'] = {
+ 'origin': [{
+ 'ext': 'lrc',
+ 'data': actual_lrc_lyrics,
+ }]
+ }
+ return info_dict
class QQPlaylistBaseIE(InfoExtractor):
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index 1631faf29..7ff1d06c4 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
parse_duration,
@@ -72,6 +73,18 @@ class RaiIE(InfoExtractor):
'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!',
'uploader': 'RaiTre',
}
+ },
+ {
+ 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+ 'md5': '037104d2c14132887e5e4cf114569214',
+ 'info_dict': {
+ 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e',
+ 'ext': 'flv',
+ 'title': 'Il pacco',
+ 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+ 'uploader': 'RaiTre',
+ 'upload_date': '20141221',
+ },
}
]
@@ -90,11 +103,14 @@ class RaiIE(InfoExtractor):
relinker_url = self._extract_relinker_url(webpage)
if not relinker_url:
- iframe_path = self._search_regex(
- r'<iframe[^>]+src="/?(dl/[^"]+\?iframe\b[^"]*)"',
+ iframe_url = self._search_regex(
+ [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
+ r'drawMediaRaiTV\(["\'](.+?)["\']'],
webpage, 'iframe')
+ if not iframe_url.startswith('http'):
+ iframe_url = compat_urlparse.urljoin(url, iframe_url)
webpage = self._download_webpage(
- '%s/%s' % (host, iframe_path), video_id)
+ iframe_url, video_id)
relinker_url = self._extract_relinker_url(webpage)
relinker = self._download_json(
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index e4215d546..e42b319a3 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -9,8 +9,8 @@ from ..utils import (
class RTBFIE(InfoExtractor):
- _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
'md5': '799f334ddf2c0a582ba80c44655be570',
'info_dict': {
@@ -19,7 +19,14 @@ class RTBFIE(InfoExtractor):
'title': 'Les Diables au coeur (épisode 2)',
'duration': 3099,
}
- }
+ }, {
+ # geo restricted
+ 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858',
+ 'only_matching': True,
+ }]
_QUALITIES = [
('mobile', 'mobile'),
@@ -36,7 +43,7 @@ class RTBFIE(InfoExtractor):
data = self._parse_json(
unescapeHTML(self._search_regex(
- r'data-video="([^"]+)"', webpage, 'data video')),
+ r'data-media="([^"]+)"', webpage, 'data video')),
video_id)
if data.get('provider').lower() == 'youtube':
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index 04158b993..d9cfbf180 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -9,16 +9,16 @@ from ..utils import (
class RteIE(InfoExtractor):
- _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://www.rte.ie/player/de/show/10363114/',
+ 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
'info_dict': {
- 'id': '10363114',
+ 'id': '10478715',
'ext': 'mp4',
- 'title': 'One News',
+ 'title': 'Watch iWitness online',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'The One O\'Clock News followed by Weather.',
- 'duration': 436.844,
+ 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.',
+ 'duration': 60.046,
},
'params': {
'skip_download': 'f4m fails with --test atm'
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
index c67ad25ce..e417bf661 100644
--- a/youtube_dl/extractor/ruutu.py
+++ b/youtube_dl/extractor/ruutu.py
@@ -57,16 +57,21 @@ class RuutuIE(InfoExtractor):
extract_formats(child)
elif child.tag.endswith('File'):
video_url = child.text
- if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+ if (not video_url or video_url in processed_urls or
+ any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
return
processed_urls.append(video_url)
ext = determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='hls'))
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- video_url, video_id, f4m_id='hds'))
+ f4m_formats = self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
else:
proto = compat_urllib_parse_urlparse(video_url).scheme
if not child.tag.startswith('HTTP') and proto != 'rtmp':
@@ -74,7 +79,7 @@ class RuutuIE(InfoExtractor):
preference = -1 if proto == 'rtmp' else 1
label = child.get('label')
tbr = int_or_none(child.get('bitrate'))
- width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')]
+ width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
formats.append({
'format_id': '%s-%s' % (proto, label if label else tbr),
'url': video_url,
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index 9c53704ea..474ebb49b 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -121,9 +121,9 @@ class SenateISVPIE(InfoExtractor):
'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
}]
else:
- hdcore_sign = '?hdcore=3.1.0'
+ hdcore_sign = 'hdcore=3.1.0'
url_params = (domain, video_id, stream_num)
- f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign
+ f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
# URLs without the extra param induce an 404 error
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
index 6e9903d5e..f76fb12c0 100644
--- a/youtube_dl/extractor/shahid.py
+++ b/youtube_dl/extractor/shahid.py
@@ -16,7 +16,7 @@ class ShahidIE(InfoExtractor):
'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',
'info_dict': {
'id': '90574',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',
'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',
'duration': 2972,
@@ -81,7 +81,7 @@ class ShahidIE(InfoExtractor):
compat_urllib_parse.urlencode({
'apiKey': 'sh@hid0nlin3',
'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
- }).encode('utf-8')),
+ })),
video_id, 'Downloading video JSON')
video = video[api_vars['playerType']]
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index ed5dcc0d3..2b60d354a 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -113,7 +113,7 @@ class SoundcloudIE(InfoExtractor):
},
]
- _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
def report_resolve(self, video_id):
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 27f4033c5..034bd47ff 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -77,17 +77,21 @@ class SpiegeltvIE(InfoExtractor):
'rtmp_live': True,
})
elif determine_ext(endpoint) == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- endpoint.replace('[video]', play_path),
- video_id, 'm4v',
- preference=1, # Prefer hls since it allows to workaround georestriction
- m3u8_id='hls', fatal=False)
- if m3u8_formats is not False:
- formats.extend(m3u8_formats)
+ formats.append({
+ 'url': endpoint.replace('[video]', play_path),
+ 'ext': 'm4v',
+ 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction
+ 'protocol': 'm3u8',
+ 'preference': 1,
+ 'http_headers': {
+ 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side
+ },
+ })
else:
formats.append({
'url': endpoint,
})
+ self._check_formats(formats, video_id)
thumbnails = []
for image in media_json['images']:
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py
new file mode 100644
index 000000000..d5c852f52
--- /dev/null
+++ b/youtube_dl/extractor/stitcher.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ unescapeHTML,
+)
+
+
+class StitcherIE(InfoExtractor):
+ # Extractor for single podcast episodes on stitcher.com.
+ #
+ # URL shape: /podcast/<one or more path segments>/e/[<display_id>-]<numeric id>
+ # The textual slug (display_id) is optional; the numeric id is always captured.
+ _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+ _TESTS = [{
+ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
+ 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
+ 'info_dict': {
+ 'id': '40789481',
+ 'ext': 'mp3',
+ 'title': 'Machine Learning Mastery and Cancer Clusters',
+ 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
+ 'duration': 1604,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
+ 'info_dict': {
+ 'id': '40846275',
+ 'display_id': 'the-rare-hourlong-comedy-plus',
+ 'ext': 'mp3',
+ 'title': "The CW's 'Crazy Ex-Girlfriend'",
+ 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
+ 'duration': 2235,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # escaped title
+ 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ # Build the info dict for one episode: download the episode page, read
+ # the embedded `stitcher` config object and scrape the description.
+ mobj = re.match(self._VALID_URL, url)
+ audio_id = mobj.group('id')
+ # Fall back to the numeric id when the URL carries no slug.
+ display_id = mobj.group('display_id') or audio_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ # The page assigns a JS object literal to `var stitcher`; it is run
+ # through js_to_json before parsing, and config.episode is kept.
+ episode = self._parse_json(
+ js_to_json(self._search_regex(
+ r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
+ display_id)['config']['episode']
+
+ # Titles can contain HTML entities (see the 'escaped title' test URL).
+ title = unescapeHTML(episode['title'])
+ # Audio-only format ('vcodec': 'none'). Only the 'episodeURL' key is
+ # consulted, and it is skipped when absent or empty, so `formats` may
+ # end up empty.
+ # NOTE(review): the `or 'mp3'` fallback only fires if determine_ext
+ # returns a falsy value for an unrecognised extension - confirm against
+ # utils.determine_ext.
+ formats = [{
+ 'url': episode[episode_key],
+ 'ext': determine_ext(episode[episode_key]) or 'mp3',
+ 'vcodec': 'none',
+ } for episode_key in ('episodeURL',) if episode.get(episode_key)]
+ # The description is scraped from the HTML rather than the config;
+ # the lookup is non-fatal.
+ description = self._search_regex(
+ r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
+ duration = int_or_none(episode.get('duration'))
+ thumbnail = episode.get('episodeImage')
+
+ return {
+ 'id': audio_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
index f1f43d0a7..744f9db38 100644
--- a/youtube_dl/extractor/tapely.py
+++ b/youtube_dl/extractor/tapely.py
@@ -16,7 +16,7 @@ from ..utils import (
class TapelyIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
_API_URL = 'http://tape.ly/showtape?id={0:}'
_S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
_SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
@@ -42,6 +42,10 @@ class TapelyIE(InfoExtractor):
'ext': 'm4a',
},
},
+ {
+ 'url': 'https://tapely.com/my-grief-as-told-by-water',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index ae94f055c..2c8e9b941 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -1,24 +1,51 @@
# coding: utf-8
from __future__ import unicode_literals
-from .mitele import MiTeleIE
+import json
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urlparse,
+)
+from ..utils import (
+ get_element_by_attribute,
+ parse_duration,
+ strip_jsonp,
+)
-class TelecincoIE(MiTeleIE):
- IE_NAME = 'telecinco.es'
- _VALID_URL = r'https?://www\.telecinco\.es/(?:[^/]+/)+(?P<id>.+?)\.html'
+
+class TelecincoIE(InfoExtractor):
+ IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
+ _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+ 'md5': '5cbef3ad5ef17bf0d21570332d140729',
'info_dict': {
'id': 'MDSVID20141015_0058',
'ext': 'mp4',
'title': 'Con Martín Berasategui, hacer un bacalao al ...',
'duration': 662,
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ }, {
+ 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
+ 'md5': '0a5b9f3cc8b074f50a0578f823a12694',
+ 'info_dict': {
+ 'id': 'MDSVID20150916_0128',
+ 'ext': 'mp4',
+ 'title': '¿Quién es este ex futbolista con el que hablan ...',
+ 'duration': 79,
+ },
+ }, {
+ 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
+ 'md5': 'ad1bfaaba922dd4a295724b05b68f86a',
+ 'info_dict': {
+ 'id': 'MDSVID20150513_0220',
+ 'ext': 'mp4',
+ 'title': '#DOYLACARA. Con la trata no hay trato',
+ 'duration': 50,
},
}, {
'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
@@ -27,3 +54,41 @@ class TelecincoIE(MiTeleIE):
'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
'only_matching': True,
}]
+
+ def _real_extract(self, url):
+ # Resolve a page URL to its formats and metadata in three steps:
+ # 1. read the MSV.embedData object embedded in the page,
+ # 2. download the XML info document it points to,
+ # 3. exchange the video link for a tokenized m3u8 URL.
+ episode = self._match_id(url)
+ webpage = self._download_webpage(url, episode)
+ # The embed data is a JS object literal; single quotes are swapped for
+ # double quotes so that json.loads accepts it.
+ # NOTE(review): this breaks if any value itself contains an apostrophe
+ # or an embedded double quote - confirm the pages never do.
+ embed_data_json = self._search_regex(
+ r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
+ ).replace('\'', '"')
+ embed_data = json.loads(embed_data_json)
+
+ domain = embed_data['mediaUrl']
+ if not domain.startswith('http'):
+ # only happens in telecinco.es videos
+ domain = 'http://' + domain
+ # flashvars.host is percent-encoded; decode it and resolve it against
+ # the media domain to get the info XML URL.
+ info_url = compat_urlparse.urljoin(
+ domain,
+ compat_urllib_parse_unquote(embed_data['flashvars']['host'])
+ )
+ # NOTE(review): the find(...).text accesses below assume every element
+ # is present in the XML; a missing node would presumably raise
+ # AttributeError - confirm the feed always includes them.
+ info_el = self._download_xml(info_url, episode).find('./video/info')
+
+ video_link = info_el.find('videoUrl/link').text
+ token_query = compat_urllib_parse.urlencode({'id': video_link})
+ # The tokenizer response is passed through strip_jsonp before JSON
+ # decoding.
+ token_info = self._download_json(
+ embed_data['flashvars']['ov_tk'] + '?' + token_query,
+ episode,
+ transform_source=strip_jsonp
+ )
+ formats = self._extract_m3u8_formats(
+ token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
+
+ # 'id' is the site's own video id from the embed data; the URL slug is
+ # kept as display_id.
+ return {
+ 'id': embed_data['videoId'],
+ 'display_id': episode,
+ 'title': info_el.find('title').text,
+ 'formats': formats,
+ 'description': get_element_by_attribute('class', 'text', webpage),
+ 'thumbnail': info_el.find('thumb').text,
+ 'duration': parse_duration(info_el.find('duration').text),
+ }
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 84fe71aef..5f7ac4b35 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -2,14 +2,12 @@
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..compat import compat_str
class TudouIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
'md5': '140a49ed444bd22f93330985d8475fcb',
@@ -27,41 +25,41 @@ class TudouIE(InfoExtractor):
'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
'thumbnail': 're:^https?://.*\.jpg$',
}
+ }, {
+ 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
+ 'only_matching': True,
}]
_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
- def _url_for_id(self, id, quality=None):
- info_url = "http://v2.tudou.com/f?id=" + str(id)
+ def _url_for_id(self, video_id, quality=None):
+ info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
if quality:
info_url += '&hd' + quality
- webpage = self._download_webpage(info_url, id, "Opening the info webpage")
- final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url')
+ xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")
+ final_url = xml_data.text
return final_url
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
- if m and m.group(1):
- return {
- '_type': 'url',
- 'url': 'youku:' + m.group(1),
- 'ie_key': 'Youku'
- }
+ youku_vcode = self._search_regex(
+ r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
+ if youku_vcode:
+ return self.url_result('youku:' + youku_vcode, ie='Youku')
title = self._search_regex(
- r",kw:\s*['\"](.+?)[\"']", webpage, 'title')
+ r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
thumbnail_url = self._search_regex(
- r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)
+ r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)
player_url = self._search_regex(
- r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']",
+ r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
webpage, 'player URL', default=self._PLAYER_URL)
- segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
- segments = json.loads(segs_json)
+ segments = self._parse_json(self._search_regex(
+ r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
# It looks like the keys are the arguments that have to be passed as
# the hd field in the request url, we pick the higher
# Also, filter non-number qualities (see issue #3643).
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 3d3b635e4..4f844706d 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class TumblrIE(InfoExtractor):
@@ -29,6 +30,19 @@ class TumblrIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
}
}, {
+ 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video',
+ 'md5': '7ae503065ad150122dc3089f8cf1546c',
+ 'info_dict': {
+ 'id': '130323439814',
+ 'ext': 'mp4',
+ 'title': 'HD Video Testing \u2014 Test description for my HD video',
+ 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'format': 'hd',
+ },
+ }, {
'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
'info_dict': {
@@ -37,6 +51,9 @@ class TumblrIE(InfoExtractor):
'title': 'naked smoking & stretching',
'upload_date': '20150506',
'timestamp': 1430931613,
+ 'age_limit': 18,
+ 'uploader_id': '1638622',
+ 'uploader': 'naked-yogi',
},
'add_ie': ['Vidme'],
}, {
@@ -66,10 +83,38 @@ class TumblrIE(InfoExtractor):
if iframe_url is None:
return self.url_result(urlh.geturl(), 'Generic')
- iframe = self._download_webpage(iframe_url, video_id,
- 'Downloading iframe page')
- video_url = self._search_regex(r'<source src="([^"]+)"',
- iframe, 'video url')
+ iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page')
+
+ duration = None
+ sources = []
+
+ sd_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
+ 'sd video url', default=None, group='url')
+ if sd_url:
+ sources.append((sd_url, 'sd'))
+
+ options = self._parse_json(
+ self._search_regex(
+ r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
+ 'hd video url', default='', group='options'),
+ video_id, fatal=False)
+ if options:
+ duration = int_or_none(options.get('duration'))
+ hd_url = options.get('hdUrl')
+ if hd_url:
+ sources.append((hd_url, 'hd'))
+
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ 'height': int_or_none(self._search_regex(
+ r'/(\d{3,4})$', video_url, 'height', default=None)),
+ 'quality': quality,
+ } for quality, (video_url, format_id) in enumerate(sources)]
+
+ self._sort_formats(formats)
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
@@ -79,9 +124,9 @@ class TumblrIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
'title': video_title,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index fad720b68..822372ea1 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -10,10 +10,10 @@ class TutvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
_TEST = {
'url': 'http://tu.tv/videos/robots-futbolistas',
- 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7',
+ 'md5': '0cd9e28ad270488911b0d2a72323395d',
'info_dict': {
'id': '2973058',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Robots futbolistas',
},
}
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 023911c41..3ec08b674 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -15,6 +15,7 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
int_or_none,
parse_duration,
@@ -27,8 +28,7 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'http://usher.twitch.tv'
- _LOGIN_URL = 'https://secure.twitch.tv/login'
- _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new'
+ _LOGIN_URL = 'http://www.twitch.tv/login'
_NETRC_MACHINE = 'twitch'
def _handle_error(self, response):
@@ -61,26 +61,28 @@ class TwitchBaseIE(InfoExtractor):
if username is None:
return
- login_page = self._download_webpage(
+ login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
login_form = self._hidden_inputs(login_page)
login_form.update({
- 'login': username.encode('utf-8'),
- 'password': password.encode('utf-8'),
+ 'username': username,
+ 'password': password,
})
+ redirect_url = handle.geturl()
+
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
- 'post url', default=self._LOGIN_POST_URL, group='url')
+ 'post url', default=redirect_url, group='url')
if not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+ post_url = compat_urlparse.urljoin(redirect_url, post_url)
request = compat_urllib_request.Request(
- post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- request.add_header('Referer', self._LOGIN_URL)
+ post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8'))
+ request.add_header('Referer', redirect_url)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
@@ -238,14 +240,24 @@ class TwitchVodIE(TwitchItemBaseIE):
def _real_extract(self, url):
item_id = self._match_id(url)
+
info = self._download_info(self._ITEM_SHORTCUT, item_id)
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % self._ITEM_TYPE)
+
formats = self._extract_m3u8_formats(
- '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
- % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+ '%s/vod/%s?%s' % (
+ self._USHER_BASE, item_id,
+ compat_urllib_parse.urlencode({
+ 'allow_source': 'true',
+ 'allow_spectre': 'true',
+ 'player': 'twitchweb',
+ 'nauth': access_token['token'],
+ 'nauthsig': access_token['sig'],
+ })),
item_id, 'mp4')
+
self._prefer_source(formats)
info['formats'] = formats
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index 1aaa06305..2bd5946ac 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -6,23 +7,53 @@ from .common import InfoExtractor
from ..compat import compat_urllib_request
from ..utils import (
float_or_none,
- unescapeHTML,
+ xpath_text,
+ remove_end,
+ int_or_none,
+ ExtractorError,
)
class TwitterCardIE(InfoExtractor):
+ IE_NAME = 'twitter:card'
_VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
- 'md5': 'a74f50b310c83170319ba16de6955192',
- 'info_dict': {
- 'id': '560070183650213889',
- 'ext': 'mp4',
- 'title': 'TwitterCard',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 30.033,
+ _TESTS = [
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+ 'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
+ 'info_dict': {
+ 'id': '560070183650213889',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 30.033,
+ }
},
- }
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
+ 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+ 'info_dict': {
+ 'id': '623160978427936768',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 80.155,
+ },
+ },
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
+ 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+ 'info_dict': {
+ 'id': 'dq4Oj5quskI',
+ 'ext': 'mp4',
+ 'title': 'Ubuntu 11.10 Overview',
+ 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+ 'upload_date': '20111013',
+ 'uploader': 'OMG! Ubuntu!',
+ 'uploader_id': 'omgubuntu',
+ },
+ }
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -40,10 +71,24 @@ class TwitterCardIE(InfoExtractor):
request.add_header('User-Agent', user_agent)
webpage = self._download_webpage(request, video_id)
- config = self._parse_json(
- unescapeHTML(self._search_regex(
- r'data-player-config="([^"]+)"', webpage, 'data player config')),
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+ webpage, 'youtube iframe', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'data player config'),
video_id)
+ if 'playlist' not in config:
+ if 'vmapUrl' in config:
+ vmap_data = self._download_xml(config['vmapUrl'], video_id)
+ video_url = xpath_text(vmap_data, './/MediaFile').strip()
+ formats.append({
+ 'url': video_url,
+ })
+ break # same video regardless of UA
+ continue
video_url = config['playlist'][0]['source']
@@ -70,3 +115,100 @@ class TwitterCardIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
+
+
+class TwitterIE(InfoExtractor):
+ # Extractor for individual tweets (twitter.com/<user>/status/<id>).
+ #
+ # A tweet with an embedded player card is delegated to TwitterCardIE via
+ # a url_transparent result; a tweet with an animated GIF is resolved
+ # directly from the <video class="animated-gif"> markup. Anything else
+ # raises ExtractorError.
+ IE_NAME = 'twitter'
+ _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
+ # The page is always fetched from this canonical URL, whichever host
+ # variant (www/m/mobile) the input URL used.
+ _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'
+
+ _TESTS = [{
+ 'url': 'https://twitter.com/freethenipple/status/643211948184596480',
+ 'md5': 'db6612ec5d03355953c3ca9250c97e5e',
+ 'info_dict': {
+ 'id': '643211948184596480',
+ 'ext': 'mp4',
+ 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 12.922,
+ 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
+ 'uploader': 'FREE THE NIPPLE',
+ 'uploader_id': 'freethenipple',
+ },
+ }, {
+ 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
+ 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
+ 'info_dict': {
+ 'id': '657991469417025536',
+ 'ext': 'mp4',
+ 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
+ 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
+ 'thumbnail': 're:^https?://.*\.png',
+ 'uploader': 'Gifs',
+ 'uploader_id': 'giphz',
+ },
+ }, {
+ 'url': 'https://twitter.com/starwars/status/665052190608723968',
+ 'md5': '39b7199856dee6cd4432e72c74bc69d4',
+ 'info_dict': {
+ 'id': '665052190608723968',
+ 'ext': 'mp4',
+ 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
+ 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
+ 'uploader_id': 'starwars',
+ 'uploader': 'Star Wars',
+ },
+ }]
+
+ def _real_extract(self, url):
+ # Returns an info dict (or a url_transparent result pointing at the
+ # tweet's player card). Raises ExtractorError when the tweet contains
+ # neither a card nor an animated GIF.
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user_id')
+ twid = mobj.group('id')
+
+ webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+
+ # og:title has the form '<display name> on Twitter'.
+ username = remove_end(self._og_search_title(webpage), ' on Twitter')
+
+ # Trim surrounding whitespace first so that the typographic quotes are
+ # really at the ends of the string when they are stripped. The earlier
+ # strip('') was a no-op: an empty character set strips nothing.
+ title = description = self._og_search_description(webpage).strip().replace('\n', ' ').strip('“”')
+
+ # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
+ title = re.sub(r'\s+(https?://[^ ]+)', '', title)
+
+ info = {
+ 'uploader_id': user_id,
+ 'uploader': username,
+ 'webpage_url': url,
+ 'description': '%s on Twitter: "%s"' % (username, description),
+ 'title': username + ' - ' + title,
+ }
+
+ # Preferred path: the tweet embeds a player card; hand over to the card
+ # extractor while keeping the metadata collected above.
+ card_id = self._search_regex(
+ r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
+ if card_id:
+ card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'url': card_url,
+ })
+ return info
+
+ # Fallback: animated GIFs are served as a plain <video> element.
+ # The height, width and poster attributes are all optional.
+ mobj = re.search(r'''(?x)
+ <video[^>]+class="animated-gif"[^>]+
+ (?:data-height="(?P<height>\d+)")?[^>]+
+ (?:data-width="(?P<width>\d+)")?[^>]+
+ (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
+ <source[^>]+video-src="(?P<url>[^"]+)"
+ ''', webpage)
+
+ if mobj:
+ info.update({
+ 'id': twid,
+ 'url': mobj.group('url'),
+ 'height': int_or_none(mobj.group('height')),
+ 'width': int_or_none(mobj.group('width')),
+ 'thumbnail': mobj.group('poster'),
+ })
+ return info
+
+ # A tweet without media is an ordinary outcome, not an extractor bug,
+ # so the error is flagged as expected.
+ raise ExtractorError('There\'s no video in this tweet.', expected=True)
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index c39c278ab..73b05ecab 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -1,17 +1,20 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+)
class UstreamIE(InfoExtractor):
- _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'
+ _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
IE_NAME = 'ustream'
_TESTS = [{
'url': 'http://www.ustream.tv/recorded/20274954',
@@ -19,8 +22,12 @@ class UstreamIE(InfoExtractor):
'info_dict': {
'id': '20274954',
'ext': 'flv',
- 'uploader': 'Young Americans for Liberty',
'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+ 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+ 'timestamp': 1328577035,
+ 'upload_date': '20120207',
+ 'uploader': 'yaliberty',
+ 'uploader_id': '6780869',
},
}, {
# From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444
@@ -32,20 +39,21 @@ class UstreamIE(InfoExtractor):
'ext': 'flv',
'title': '-CG11- Canada Games Figure Skating',
'uploader': 'sportscanadatv',
- }
+ },
+ 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.',
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- video_id = m.group('videoID')
+ video_id = m.group('id')
# some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990)
if m.group('type') == 'embed/recorded':
- video_id = m.group('videoID')
+ video_id = m.group('id')
desktop_url = 'http://www.ustream.tv/recorded/' + video_id
return self.url_result(desktop_url, 'Ustream')
if m.group('type') == 'embed':
- video_id = m.group('videoID')
+ video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
desktop_video_id = self._html_search_regex(
r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')
@@ -53,52 +61,50 @@ class UstreamIE(InfoExtractor):
return self.url_result(desktop_url, 'Ustream')
params = self._download_json(
- 'http://cdngw.ustream.tv/rgwjson/Viewer.getVideo/' + json.dumps({
- 'brandId': 1,
- 'videoId': int(video_id),
- 'autoplay': False,
- }), video_id)
-
- if 'error' in params:
- raise ExtractorError(params['error']['message'], expected=True)
-
- video_url = params['flv']
+ 'https://api.ustream.tv/videos/%s.json' % video_id, video_id)
- webpage = self._download_webpage(url, video_id)
+ error = params.get('error')
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error), expected=True)
- self.report_extraction(video_id)
+ video = params['video']
- video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
- webpage, 'title', default=None)
+ title = video['title']
+ filesize = float_or_none(video.get('file_size'))
- if not video_title:
- try:
- video_title = params['moduleConfig']['meta']['title']
- except KeyError:
- pass
-
- if not video_title:
- video_title = 'Ustream video ' + video_id
+ formats = [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': format_id,
+ 'filesize': filesize,
+ } for format_id, video_url in video['media_urls'].items()]
+ self._sort_formats(formats)
- uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
- webpage, 'uploader', fatal=False, flags=re.DOTALL, default=None)
+ description = video.get('description')
+ timestamp = int_or_none(video.get('created_at'))
+ duration = float_or_none(video.get('length'))
+ view_count = int_or_none(video.get('views'))
- if not uploader:
- try:
- uploader = params['moduleConfig']['meta']['userName']
- except KeyError:
- uploader = None
+ uploader = video.get('owner', {}).get('username')
+ uploader_id = video.get('owner', {}).get('id')
- thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()]
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
'uploader': uploader,
- 'thumbnail': thumbnail,
+ 'uploader_id': uploader_id,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index c17094f81..4c0de354f 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,10 +1,10 @@
from __future__ import unicode_literals
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_request,
)
from ..utils import (
@@ -97,7 +97,7 @@ class VevoIE(InfoExtractor):
if last_version['version'] == -1:
raise ExtractorError('Unable to extract last version of the video')
- renditions = xml.etree.ElementTree.fromstring(last_version['data'])
+ renditions = compat_etree_fromstring(last_version['data'])
formats = []
# Already sorted from worst to best quality
for rend in renditions.findall('rendition'):
@@ -114,7 +114,7 @@ class VevoIE(InfoExtractor):
def _formats_from_smil(self, smil_xml):
formats = []
- smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+ smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
for el in els:
src = el.attrib['src']
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py
index 94f9e9be9..cd3f50a63 100644
--- a/youtube_dl/extractor/videofyme.py
+++ b/youtube_dl/extractor/videofyme.py
@@ -2,8 +2,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- find_xpath_attr,
int_or_none,
+ parse_iso8601,
)
@@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor):
'id': '1100701',
'ext': 'mp4',
'title': 'This is VideofyMe',
- 'description': None,
+ 'description': '',
+ 'upload_date': '20130326',
+ 'timestamp': 1364288959,
'uploader': 'VideofyMe',
'uploader_id': 'thisisvideofyme',
'view_count': int,
+ 'likes': int,
+ 'comment_count': int,
},
-
}
def _real_extract(self, url):
video_id = self._match_id(url)
- config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
- video_id)
- video = config.find('video')
- sources = video.find('sources')
- url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
- for key in ['on', 'av', 'off']] if node is not None)
- video_url = url_node.find('url').text
- view_count = int_or_none(self._search_regex(
- r'([0-9]+)', video.find('views').text, 'view count', fatal=False))
+
+ config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo']
+
+ video = config.get('video')
+ blog = config.get('blog', {})
return {
'id': video_id,
- 'title': video.find('title').text,
- 'url': video_url,
- 'thumbnail': video.find('thumb').text,
- 'description': video.find('description').text,
- 'uploader': config.find('blog/name').text,
- 'uploader_id': video.find('identifier').text,
- 'view_count': view_count,
+ 'title': video['title'],
+ 'url': video['sources']['source']['url'],
+ 'thumbnail': video.get('thumb'),
+ 'description': video.get('description'),
+ 'timestamp': parse_iso8601(video.get('date')),
+ 'uploader': blog.get('name'),
+ 'uploader_id': blog.get('identifier'),
+ 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)),
+ 'likes': int_or_none(video.get('likes')),
+ 'comment_count': int_or_none(video.get('nrOfComments')),
}
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
deleted file mode 100644
index ef2da5632..000000000
--- a/youtube_dl/extractor/videolecturesnet.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- find_xpath_attr,
- int_or_none,
- parse_duration,
- unified_strdate,
-)
-
-
-class VideoLecturesNetIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$'
- IE_NAME = 'videolectures.net'
-
- _TEST = {
- 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
- 'info_dict': {
- 'id': 'promogram_igor_mekjavic_eng',
- 'ext': 'mp4',
- 'title': 'Automatics, robotics and biocybernetics',
- 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
- 'upload_date': '20130627',
- 'duration': 565,
- 'thumbnail': 're:http://.*\.jpg',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
- smil = self._download_xml(smil_url, video_id)
-
- title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
- description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
- description = (
- None if description_el is None
- else description_el.attrib['content'])
- upload_date = unified_strdate(
- find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
-
- switch = smil.find('.//switch')
- duration = parse_duration(switch.attrib.get('dur'))
- thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
- thumbnail = (
- None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
-
- formats = []
- for v in switch.findall('./video'):
- proto = v.attrib.get('proto')
- if proto not in ['http', 'rtmp']:
- continue
- f = {
- 'width': int_or_none(v.attrib.get('width')),
- 'height': int_or_none(v.attrib.get('height')),
- 'filesize': int_or_none(v.attrib.get('size')),
- 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
- 'ext': v.attrib.get('ext'),
- }
- src = v.attrib['src']
- if proto == 'http':
- if self._is_valid_url(src, video_id):
- f['url'] = src
- formats.append(f)
- elif proto == 'rtmp':
- f.update({
- 'url': v.attrib['streamer'],
- 'play_path': src,
- 'rtmp_real_time': True,
- })
- formats.append(f)
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'upload_date': upload_date,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index 9a794e609..3d63ed4f0 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -14,7 +14,7 @@ class VidmeIE(InfoExtractor):
_VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
_TESTS = [{
'url': 'https://vid.me/QNB',
- 'md5': 'c62f1156138dc3323902188c5b5a8bd6',
+ 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
'info_dict': {
'id': 'QNB',
'ext': 'mp4',
@@ -93,6 +93,39 @@ class VidmeIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # nsfw, user-disabled
+ 'url': 'https://vid.me/dzGJ',
+ 'only_matching': True,
+ }, {
+ # suspended
+ 'url': 'https://vid.me/Ox3G',
+ 'only_matching': True,
+ }, {
+ # deleted
+ 'url': 'https://vid.me/KTPm',
+ 'only_matching': True,
+ }, {
+ # no formats in the API response
+ 'url': 'https://vid.me/e5g',
+ 'info_dict': {
+ 'id': 'e5g',
+ 'ext': 'mp4',
+ 'title': 'Video upload (e5g)',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1401480195,
+ 'upload_date': '20140530',
+ 'uploader': None,
+ 'uploader_id': None,
+ 'age_limit': 0,
+ 'duration': 483,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -114,12 +147,32 @@ class VidmeIE(InfoExtractor):
video = response['video']
+ if video.get('state') == 'deleted':
+ raise ExtractorError(
+ 'Vidme said: Sorry, this video has been deleted.',
+ expected=True)
+
+ if video.get('state') in ('user-disabled', 'suspended'):
+ raise ExtractorError(
+ 'Vidme said: This video has been suspended either due to a copyright claim, '
+ 'or for violating the terms of use.',
+ expected=True)
+
formats = [{
'format_id': f.get('type'),
'url': f['uri'],
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
+ 'preference': 0 if f.get('type', '').endswith('clip') else 1,
} for f in video.get('formats', []) if f.get('uri')]
+
+ if not formats and video.get('complete_url'):
+ formats.append({
+ 'url': video.get('complete_url'),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ })
+
self._sort_formats(formats)
title = video['title']
@@ -136,7 +189,7 @@ class VidmeIE(InfoExtractor):
return {
'id': video_id,
- 'title': title,
+ 'title': title or 'Video upload (%s)' % video_id,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index 08a5a7b8d..2ba9f31df 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -20,8 +20,14 @@ class VidziIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
+ video_host = self._html_search_regex(
+ r'id=\'vplayer\'><img src="http://(.*?)/i', webpage,
+ 'video host')
+ video_hash = self._html_search_regex(
+ r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash')
+ ext = self._html_search_regex(
+ r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext')
+ video_url = 'http://' + video_host + '/' + video_hash + '/v.' + ext
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index cda02ba24..7cf930d69 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -3,12 +3,14 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
+ compat_HTTPError,
compat_urllib_request,
compat_urllib_parse,
compat_urllib_parse_unquote,
)
from ..utils import (
determine_ext,
+ ExtractorError,
int_or_none,
parse_iso8601,
HEADRequest,
@@ -16,14 +18,14 @@ from ..utils import (
class ViewsterIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
+ _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
_TESTS = [{
# movie, Type=Movie
'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/',
- 'md5': '14d3cfffe66d57b41ae2d9c873416f01',
+ 'md5': 'e642d1b27fcf3a4ffa79f194f5adde36',
'info_dict': {
'id': '1140-11855-000',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'The listening Project',
'description': 'md5:bac720244afd1a8ea279864e67baa071',
'timestamp': 1214870400,
@@ -33,10 +35,10 @@ class ViewsterIE(InfoExtractor):
}, {
# series episode, Type=Episode
'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/',
- 'md5': 'd5434c80fcfdb61651cc2199a88d6ba3',
+ 'md5': '9243079a8531809efe1b089db102c069',
'info_dict': {
'id': '1284-19427-001',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'The World and a Wall',
'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',
'timestamp': 1428192000,
@@ -61,6 +63,14 @@ class ViewsterIE(InfoExtractor):
'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',
},
'playlist_mincount': 16,
+ }, {
+ # geo restricted series
+ 'url': 'https://www.viewster.com/serie/1280-18794-002/',
+ 'only_matching': True,
+ }, {
+ # geo restricted video
+ 'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/',
+ 'only_matching': True,
}]
_ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
@@ -74,8 +84,8 @@ class ViewsterIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
# Get 'api_token' cookie
- self._request_webpage(HEADRequest(url), video_id)
- cookies = self._get_cookies(url)
+ self._request_webpage(HEADRequest('http://www.viewster.com/'), video_id)
+ cookies = self._get_cookies('http://www.viewster.com/')
self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
info = self._download_json(
@@ -85,10 +95,16 @@ class ViewsterIE(InfoExtractor):
entry_id = info.get('Id') or info['id']
# unfinished serie has no Type
- if info.get('Type') in ['Serie', None]:
- episodes = self._download_json(
- 'https://public-api.viewster.com/series/%s/episodes' % entry_id,
- video_id, 'Downloading series JSON')
+ if info.get('Type') in ('Serie', None):
+ try:
+ episodes = self._download_json(
+ 'https://public-api.viewster.com/series/%s/episodes' % entry_id,
+ video_id, 'Downloading series JSON')
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ self.raise_geo_restricted()
+ else:
+ raise
entries = [
self.url_result(
'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster')
@@ -98,7 +114,7 @@ class ViewsterIE(InfoExtractor):
return self.playlist_result(entries, video_id, title, description)
formats = []
- for media_type in ('application/f4m+xml', 'application/x-mpegURL'):
+ for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
media = self._download_json(
'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
% (entry_id, compat_urllib_parse.quote(media_type)),
@@ -115,14 +131,28 @@ class ViewsterIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds'))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_formats = self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id='hls',
- fatal=False # m3u8 sometimes fail
- ))
+ fatal=False) # m3u8 sometimes fail
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
else:
- formats.append({
+ format_id = media.get('Bitrate')
+ f = {
'url': video_url,
- })
+ 'format_id': 'mp4-%s' % format_id,
+ 'height': int_or_none(media.get('Height')),
+ 'width': int_or_none(media.get('Width')),
+ 'preference': 1,
+ }
+ if format_id and not f['height']:
+ f['height'] = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append(f)
+
+ if not formats and not info.get('LanguageSets') and not info.get('VODSettings'):
+ self.raise_geo_restricted()
+
self._sort_formats(formats)
synopsis = info.get('Synopsis', {})
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py
new file mode 100644
index 000000000..525e303d4
--- /dev/null
+++ b/youtube_dl/extractor/viidea.py
@@ -0,0 +1,188 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+ compat_str,
+)
+from ..utils import (
+ parse_duration,
+ js_to_json,
+ parse_iso8601,
+)
+
+
+class ViideaIE(InfoExtractor):
+ _VALID_URL = r'''(?x)http://(?:www\.)?(?:
+ videolectures\.net|
+ flexilearn\.viidea\.net|
+ presentations\.ocwconsortium\.org|
+ video\.travel-zoom\.si|
+ video\.pomp-forum\.si|
+ tv\.nil\.si|
+ video\.hekovnik.com|
+ video\.szko\.si|
+ kpk\.viidea\.com|
+ inside\.viidea\.net|
+ video\.kiberpipa\.org|
+ bvvideo\.si|
+ kongres\.viidea\.net|
+ edemokracija\.viidea\.com
+ )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$'''
+
+ _TESTS = [{
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+ 'info_dict': {
+ 'id': '20171',
+ 'display_id': 'promogram_igor_mekjavic_eng',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1372349289,
+ 'upload_date': '20130627',
+ 'duration': 565,
+ },
+ }, {
+ # video with invalid direct format links (HTTP 403)
+ 'url': 'http://videolectures.net/russir2010_filippova_nlp/',
+ 'info_dict': {
+ 'id': '14891',
+ 'display_id': 'russir2010_filippova_nlp',
+ 'ext': 'flv',
+ 'title': 'NLP at Google',
+ 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1284375600,
+ 'upload_date': '20100913',
+ 'duration': 5352,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # event playlist
+ 'url': 'http://videolectures.net/deeplearning2015_montreal/',
+ 'info_dict': {
+ 'id': '23181',
+ 'title': 'Deep Learning Summer School, Montreal 2015',
+ 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1438560000,
+ },
+ 'playlist_count': 30,
+ }, {
+ # multi part lecture
+ 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
+ 'info_dict': {
+ 'id': '9737',
+ 'display_id': 'mlss09uk_bishop_ibi',
+ 'title': 'Introduction To Bayesian Inference',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1251622800,
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '9737_part1',
+ 'display_id': 'mlss09uk_bishop_ibi_part1',
+ 'ext': 'wmv',
+ 'title': 'Introduction To Bayesian Inference (Part 1)',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 4622,
+ 'timestamp': 1251622800,
+ 'upload_date': '20090830',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '9737_part2',
+ 'display_id': 'mlss09uk_bishop_ibi_part2',
+ 'ext': 'wmv',
+ 'title': 'Introduction To Bayesian Inference (Part 2)',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 5641,
+ 'timestamp': 1251622800,
+ 'upload_date': '20090830',
+ },
+ }],
+ 'playlist_count': 2,
+ }]
+
+ def _real_extract(self, url):
+ lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
+
+ webpage = self._download_webpage(url, lecture_slug)
+
+ cfg = self._parse_json(self._search_regex(
+ [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function',
+ r'cfg\s*:\s*({[^}]+})'],
+ webpage, 'cfg'), lecture_slug, js_to_json)
+
+ lecture_id = compat_str(cfg['obj_id'])
+
+ base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
+
+ lecture_data = self._download_json(
+ '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
+ lecture_id)['lecture'][0]
+
+ lecture_info = {
+ 'id': lecture_id,
+ 'display_id': lecture_slug,
+ 'title': lecture_data['title'],
+ 'timestamp': parse_iso8601(lecture_data.get('time')),
+ 'description': lecture_data.get('description_wiki'),
+ 'thumbnail': lecture_data.get('thumb'),
+ }
+
+ playlist_entries = []
+ lecture_type = lecture_data.get('type')
+ parts = [compat_str(video) for video in cfg.get('videos', [])]
+ if parts:
+ multipart = len(parts) > 1
+
+ def extract_part(part_id):
+ smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
+ smil = self._download_smil(smil_url, lecture_id)
+ info = self._parse_smil(smil, smil_url, lecture_id)
+ info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
+ info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
+ if multipart:
+ info['title'] += ' (Part %s)' % part_id
+ switch = smil.find('.//switch')
+ if switch is not None:
+ info['duration'] = parse_duration(switch.attrib.get('dur'))
+ item_info = lecture_info.copy()
+ item_info.update(info)
+ return item_info
+
+ if explicit_part_id or not multipart:
+ result = extract_part(explicit_part_id or parts[0])
+ else:
+ result = {
+ '_type': 'multi_video',
+ 'entries': [extract_part(part) for part in parts],
+ }
+ result.update(lecture_info)
+
+ # Immediately return an explicitly requested part or a non-event item
+ if explicit_part_id or lecture_type != 'evt':
+ return result
+
+ playlist_entries.append(result)
+
+ # It's probably a playlist
+ if not parts or lecture_type == 'evt':
+ playlist_webpage = self._download_webpage(
+ '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
+ entries = [
+ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
+ for _, video_url in re.findall(
+ r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
+ playlist_entries.extend(entries)
+
+ playlist = self.playlist_result(playlist_entries, lecture_id)
+ playlist.update(lecture_info)
+ return playlist
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 50df79ca1..b72341a2b 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -8,11 +8,11 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
InAdvancePagedList,
int_or_none,
@@ -39,23 +39,31 @@ class VimeoBaseInfoExtractor(InfoExtractor):
return
self.report_login()
webpage = self._download_webpage(self._LOGIN_URL, None, False)
- token = self._extract_xsrft(webpage)
- data = urlencode_postdata({
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata(encode_dict({
'action': 'login',
'email': username,
'password': password,
'service': 'vimeo',
'token': token,
- })
+ }))
login_request = compat_urllib_request.Request(self._LOGIN_URL, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
login_request.add_header('Referer', self._LOGIN_URL)
+ self._set_vimeo_cookie('vuid', vuid)
self._download_webpage(login_request, None, False, 'Wrong login info')
- def _extract_xsrft(self, webpage):
- return self._search_regex(
+ def _extract_xsrft_and_vuid(self, webpage):
+ xsrft = self._search_regex(
r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
webpage, 'login token', group='xsrft')
+ vuid = self._search_regex(
+ r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1',
+ webpage, 'vuid', group='vuid')
+ return xsrft, vuid
+
+ def _set_vimeo_cookie(self, name, value):
+ self._set_cookie('vimeo.com', name, value)
class VimeoIE(VimeoBaseInfoExtractor):
@@ -80,12 +88,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '56015672',
'ext': 'mp4',
- "upload_date": "20121220",
- "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- "uploader_id": "user7108434",
- "uploader": "Filippo Valsorda",
- "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- "duration": 10,
+ 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'description': 'md5:2d3305bad981a06ff79f027f19865021',
+ 'upload_date': '20121220',
+ 'uploader_id': 'user7108434',
+ 'uploader': 'Filippo Valsorda',
+ 'duration': 10,
},
},
{
@@ -98,7 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
- 'description': 'md5:380943ec71b89736ff4bf27183233d09',
+ 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30',
'duration': 1595,
},
},
@@ -128,7 +136,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
- 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',
+ 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026',
},
'params': {
'videopassword': 'youtube-dl',
@@ -152,7 +160,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
{
'url': 'http://vimeo.com/76979871',
- 'md5': '3363dd6ffebe3784d56f4132317fd446',
'note': 'Video with subtitles',
'info_dict': {
'id': '76979871',
@@ -177,6 +184,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'user28849593',
},
},
+ {
+ 'url': 'https://vimeo.com/109815029',
+ 'note': 'Video not completely processed, "failed" seed status',
+ 'only_matching': True,
+ },
]
@staticmethod
@@ -198,17 +210,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- token = self._extract_xsrft(webpage)
- data = urlencode_postdata({
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata(encode_dict({
'password': password,
'token': token,
- })
+ }))
if url.startswith('http://'):
# vimeo only supports https now, but the user can give an http url
url = url.replace('http://', 'https://')
password_request = compat_urllib_request.Request(url + '/password', data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
password_request.add_header('Referer', url)
+ self._set_vimeo_cookie('vuid', vuid)
return self._download_webpage(
password_request, video_id,
'Verifying the password', 'Wrong password')
@@ -217,7 +230,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
- data = compat_urllib_parse.urlencode({'password': password})
+ data = urlencode_postdata(encode_dict({'password': password}))
pass_url = url + '/check-password'
password_request = compat_urllib_request.Request(pass_url, data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -268,20 +281,30 @@ class VimeoIE(VimeoBaseInfoExtractor):
self.report_extraction(video_id)
vimeo_config = self._search_regex(
- r'vimeo\.config\s*=\s*({.+?});', webpage,
+ r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage,
'vimeo config', default=None)
if vimeo_config:
seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {})
if seed_status.get('state') == 'failed':
raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, seed_status['title']),
+ '%s said: %s' % (self.IE_NAME, seed_status['title']),
expected=True)
# Extract the config JSON
try:
try:
config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage, 'config URL')
+ r' data-config-url="(.+?)"', webpage,
+ 'config URL', default=None)
+ if not config_url:
+ # Sometimes a new React-based page is served instead of the old one; it requires
+ # a different config URL extraction approach (see
+ # https://github.com/rg3/youtube-dl/pull/7209)
+ vimeo_clip_page_config = self._search_regex(
+ r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
+ 'vimeo clip page config')
+ config_url = self._parse_json(
+ vimeo_clip_page_config, video_id)['player']['config_url']
config_json = self._download_webpage(config_url, video_id)
config = json.loads(config_json)
except RegexNotFoundError:
@@ -364,41 +387,29 @@ class VimeoIE(VimeoBaseInfoExtractor):
like_count = None
comment_count = None
- # Vimeo specific: extract request signature and timestamp
- sig = config['request']['signature']
- timestamp = config['request']['timestamp']
-
- # Vimeo specific: extract video codec and quality information
- # First consider quality, then codecs, then take everything
- codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
- files = {'hd': [], 'sd': [], 'other': []}
- config_files = config["video"].get("files") or config["request"].get("files")
- for codec_name, codec_extension in codecs:
- for quality in config_files.get(codec_name, []):
- format_id = '-'.join((codec_name, quality)).lower()
- key = quality if quality in files else 'other'
- video_url = None
- if isinstance(config_files[codec_name], dict):
- file_info = config_files[codec_name][quality]
- video_url = file_info.get('url')
- else:
- file_info = {}
- if video_url is None:
- video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
- % (video_id, sig, timestamp, quality, codec_name.upper())
-
- files[key].append({
- 'ext': codec_extension,
- 'url': video_url,
- 'format_id': format_id,
- 'width': file_info.get('width'),
- 'height': file_info.get('height'),
- })
formats = []
- for key in ('other', 'sd', 'hd'):
- formats += files[key]
- if len(formats) == 0:
- raise ExtractorError('No known codec found')
+ config_files = config['video'].get('files') or config['request'].get('files', {})
+ for f in config_files.get('progressive', []):
+ video_url = f.get('url')
+ if not video_url:
+ continue
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'http-%s' % f.get('quality'),
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'fps': int_or_none(f.get('fps')),
+ 'tbr': int_or_none(f.get('bitrate')),
+ })
+ m3u8_url = config_files.get('hls', {}).get('url')
+ if m3u8_url:
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ # Bitrates are completely broken. A single m3u8 may contain entries in kbps and bps
+ # at the same time without actual units specified. This leads to wrong sorting.
+ self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id'))
subtitles = {}
text_tracks = config['request'].get('text_tracks')
@@ -459,16 +470,17 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
if password is None:
raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
fields = self._hidden_inputs(login_form)
- token = self._extract_xsrft(webpage)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
fields['token'] = token
fields['password'] = password
- post = urlencode_postdata(fields)
+ post = urlencode_postdata(encode_dict(fields))
password_path = self._search_regex(
r'action="([^"]+)"', login_form, 'password URL')
password_url = compat_urlparse.urljoin(page_url, password_path)
password_request = compat_urllib_request.Request(password_url, post)
password_request.add_header('Content-type', 'application/x-www-form-urlencoded')
- self._set_cookie('vimeo.com', 'xsrft', token)
+ self._set_vimeo_cookie('vuid', vuid)
+ self._set_vimeo_cookie('xsrft', token)
return self._download_webpage(
password_request, list_id,
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index c733a48fa..cb2a4b0b5 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -1,10 +1,14 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
class VineIE(InfoExtractor):
@@ -17,10 +21,12 @@ class VineIE(InfoExtractor):
'ext': 'mp4',
'title': 'Chicken.',
'alt_title': 'Vine by Jack Dorsey',
- 'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/v/MYxVapFvz2z',
@@ -29,11 +35,13 @@ class VineIE(InfoExtractor):
'id': 'MYxVapFvz2z',
'ext': 'mp4',
'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
- 'alt_title': 'Vine by Luna',
- 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
+ 'alt_title': 'Vine by Mars Ruiz',
'upload_date': '20140815',
- 'uploader': 'Luna',
+ 'uploader': 'Mars Ruiz',
'uploader_id': '1102363502380728320',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/v/bxVjBbZlPUH',
@@ -43,14 +51,33 @@ class VineIE(InfoExtractor):
'ext': 'mp4',
'title': '#mw3 #ac130 #killcam #angelofdeath',
'alt_title': 'Vine by Z3k3',
- 'description': '#mw3 #ac130 #killcam #angelofdeath',
'upload_date': '20130430',
'uploader': 'Z3k3',
'uploader_id': '936470460173008896',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/oembed/MYxVapFvz2z.json',
'only_matching': True,
+ }, {
+ 'url': 'https://vine.co/v/e192BnZnZ9V',
+ 'info_dict': {
+ 'id': 'e192BnZnZ9V',
+ 'ext': 'mp4',
+ 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2',
+ 'alt_title': 'Vine by Pimry_zaa',
+ 'upload_date': '20150705',
+ 'uploader': 'Pimry_zaa',
+ 'uploader_id': '1135760698325307392',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -58,32 +85,33 @@ class VineIE(InfoExtractor):
webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
data = self._parse_json(
- self._html_search_regex(
- r'window\.POST_DATA = { %s: ({.+?}) };\s*</script>' % video_id,
+ self._search_regex(
+ r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
webpage, 'vine data'),
video_id)
formats = [{
'format_id': '%(format)s-%(rate)s' % f,
- 'vcodec': f['format'],
- 'quality': f['rate'],
+ 'vcodec': f.get('format'),
+ 'quality': f.get('rate'),
'url': f['videoUrl'],
- } for f in data['videoUrls']]
+ } for f in data['videoUrls'] if f.get('videoUrl')]
self._sort_formats(formats)
+ username = data.get('username')
+
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'alt_title': self._og_search_description(webpage, default=None),
- 'description': data['description'],
- 'thumbnail': data['thumbnailUrl'],
- 'upload_date': unified_strdate(data['created']),
- 'uploader': data['username'],
- 'uploader_id': data['userIdStr'],
- 'like_count': data['likes']['count'],
- 'comment_count': data['comments']['count'],
- 'repost_count': data['reposts']['count'],
+ 'title': data.get('description') or self._og_search_title(webpage),
+ 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None),
+ 'thumbnail': data.get('thumbnailUrl'),
+ 'upload_date': unified_strdate(data.get('created')),
+ 'uploader': username,
+ 'uploader_id': data.get('userIdStr'),
+ 'like_count': int_or_none(data.get('likes', {}).get('count')),
+ 'comment_count': int_or_none(data.get('comments', {}).get('count')),
+ 'repost_count': int_or_none(data.get('reposts', {}).get('count')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index c30c5a8e5..01960b827 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -17,6 +17,7 @@ from ..utils import (
unescapeHTML,
unified_strdate,
)
+from .vimeo import VimeoIE
class VKIE(InfoExtractor):
@@ -249,6 +250,10 @@ class VKIE(InfoExtractor):
if youtube_url:
return self.url_result(youtube_url, 'Youtube')
+ vimeo_url = VimeoIE._extract_vimeo_url(url, info_page)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
+
m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
if m_rutube is not None:
@@ -276,9 +281,13 @@ class VKIE(InfoExtractor):
mobj.group(1) + ' ' + mobj.group(2)
upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
- view_count = str_to_int(self._search_regex(
- r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
- info_page, 'view count', fatal=False))
+ view_count = None
+ views = self._html_search_regex(
+ r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
+ info_page, 'view count', fatal=False)
+ if views:
+ view_count = str_to_int(self._search_regex(
+ r'([\d,.]+)', views, 'view count', fatal=False))
formats = [{
'format_id': k,
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index 2ddf29a69..5a897371d 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -84,6 +84,5 @@ class WSJIE(InfoExtractor):
'duration': duration,
'upload_date': upload_date,
'title': title,
- 'formats': formats,
'categories': categories,
}
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/xfileshare.py
index d23e3eac1..7610dc627 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -15,11 +15,11 @@ from ..utils import (
)
-class GorillaVidIE(InfoExtractor):
- IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com'
+class XFileShareIE(InfoExtractor):
+ IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
@@ -76,6 +76,13 @@ class GorillaVidIE(InfoExtractor):
'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4',
'thumbnail': 're:http://.*\.jpg',
}
+ }, {
+ 'url': 'http://vidto.me/ku5glz52nqe1.html',
+ 'info_dict': {
+ 'id': 'ku5glz52nqe1',
+ 'ext': 'mp4',
+ 'title': 'test'
+ }
}]
def _real_extract(self, url):
@@ -104,13 +111,18 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage(req, video_id, 'Downloading video page')
- title = self._search_regex(
- [r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'>Watch (.+) '],
- webpage, 'title', default=None) or self._og_search_title(webpage)
+ title = (self._search_regex(
+ [r'style="z-index: [0-9]+;">([^<]+)</span>',
+ r'<td nowrap>([^<]+)</td>',
+ r'>Watch (.+) ',
+ r'<h2 class="video-page-head">([^<]+)</h2>'],
+ webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
video_url = self._search_regex(
- r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url')
+ [r'file\s*:\s*["\'](http[^"\']+)["\'],',
+ r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-Z.\/\-_]+)'],
+ webpage, 'file url')
thumbnail = self._search_regex(
- r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', fatal=False)
+ r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
formats = [{
'format_id': 'sd',
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 97315750f..8938c0e45 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -63,7 +63,9 @@ class XHamsterIE(InfoExtractor):
mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
+ title = self._html_search_regex(
+ [r'<title>(?P<title>.+?)(?:, (?:[^,]+? )?Porn: xHamster| - xHamster\.com)</title>',
+ r'<h1>([^<]+)</h1>'], webpage, 'title')
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index 4098e4629..08dc81f3a 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -46,6 +46,12 @@ class YandexMusicTrackIE(InfoExtractor):
% (data['host'], key, data['ts'] + data['path'], storage[1]))
def _get_track_info(self, track):
+ thumbnail = None
+ cover_uri = track.get('albums', [{}])[0].get('coverUri')
+ if cover_uri:
+ thumbnail = cover_uri.replace('%%', 'orig')
+ if not thumbnail.startswith('http'):
+ thumbnail = 'http://' + thumbnail
return {
'id': track['id'],
'ext': 'mp3',
@@ -53,6 +59,7 @@ class YandexMusicTrackIE(InfoExtractor):
'title': '%s - %s' % (track['artists'][0]['name'], track['title']),
'filesize': int_or_none(track.get('fileSize')),
'duration': float_or_none(track.get('durationMs'), 1000),
+ 'thumbnail': thumbnail,
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 4ba7c36db..9bf8d1eeb 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,121 +1,171 @@
from __future__ import unicode_literals
-
-import json
import re
-import sys
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_request
from ..utils import (
- ExtractorError,
+ int_or_none,
+ str_to_int,
unescapeHTML,
unified_strdate,
)
-from ..aes import (
- aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
class YouPornIE(InfoExtractor):
- _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+ 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
'info_dict': {
'id': '505835',
+ 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
'ext': 'mp4',
- 'upload_date': '20101221',
+ 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'Ask Dan And Jennifer',
- 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+ 'upload_date': '20101221',
+ 'average_rating': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'categories': list,
+ 'tags': list,
'age_limit': 18,
- }
- }
+ },
+ }, {
+ # Anonymous User uploader
+ 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
+ 'info_dict': {
+ 'id': '561726',
+ 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show',
+ 'ext': 'mp4',
+ 'title': 'Big Tits Awesome Brunette On amazing webcam show',
+ 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Anonymous User',
+ 'upload_date': '20111125',
+ 'average_rating': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'categories': list,
+ 'tags': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = mobj.group('proto') + 'www.' + mobj.group('url')
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- req = compat_urllib_request.Request(url)
- req.add_header('Cookie', 'age_verified=1')
- webpage = self._download_webpage(req, video_id)
- age_limit = self._rta_search(webpage)
+ request = compat_urllib_request.Request(url)
+ request.add_header('Cookie', 'age_verified=1')
+ webpage = self._download_webpage(request, display_id)
+
+ title = self._search_regex(
+ [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1',
+ r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
+ webpage, 'title', group='title')
- # Get JSON parameters
- json_params = self._search_regex(
- [r'videoJa?son\s*=\s*({.+})',
- r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'],
- webpage, 'JSON parameters')
- try:
- params = json.loads(json_params)
- except ValueError:
- raise ExtractorError('Invalid JSON')
-
- self.report_extraction(video_id)
- try:
- video_title = params['title']
- upload_date = unified_strdate(params['release_date_f'])
- video_description = params['description']
- video_uploader = params['submitted_by']
- thumbnail = params['thumbnails'][0]['image']
- except KeyError:
- raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
-
- # Get all of the links from the page
- DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
- download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
- webpage, 'download list').strip()
- LINK_RE = r'<a href="([^"]+)">'
- links = re.findall(LINK_RE, download_list_html)
-
- # Get all encrypted links
- encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage)
- for encrypted_link in encrypted_links:
- link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
+ links = []
+
+ sources = self._search_regex(
+ r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+ if sources:
+ for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
+ links.append(link)
+
+ # Fallback #1
+ for _, link in re.findall(
+ r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
+ links.append(link)
+
+ # Fallback #2, this also contains extra low quality 180p format
+ for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
links.append(link)
+ # Fallback #3, encrypted links
+ for _, encrypted_link in re.findall(
+ r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
+ links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
+
formats = []
- for link in links:
- # A link looks like this:
- # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
- # A path looks like this:
- # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
- video_url = unescapeHTML(link)
- path = compat_urllib_parse_urlparse(video_url).path
- format_parts = path.split('/')[4].split('_')[:2]
-
- dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0]
-
- resolution = format_parts[0]
- height = int(resolution[:-len('p')])
- bitrate = int(format_parts[1][:-len('k')])
- format = '-'.join(format_parts) + '-' + dn
-
- formats.append({
+ for video_url in set(unescapeHTML(link) for link in links):
+ f = {
'url': video_url,
- 'format': format,
- 'format_id': format,
- 'height': height,
- 'tbr': bitrate,
- 'resolution': resolution,
- })
-
+ }
+ # Video URL's path looks like this:
+ # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # We will benefit from it by extracting some metadata
+ mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ if mobj:
+ height = int(mobj.group('height'))
+ bitrate = int(mobj.group('bitrate'))
+ f.update({
+ 'format_id': '%dp-%dk' % (height, bitrate),
+ 'height': height,
+ 'tbr': bitrate,
+ })
+ formats.append(f)
self._sort_formats(formats)
- if not formats:
- raise ExtractorError('ERROR: no known formats available for video')
+ description = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>',
+ webpage, 'description', default=None)
+ thumbnail = self._search_regex(
+ r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
+ webpage, 'thumbnail', fatal=False, group='thumbnail')
+
+ uploader = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>',
+ webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>',
+ webpage, 'upload date', fatal=False))
+
+ age_limit = self._rta_search(webpage)
+
+ average_rating = int_or_none(self._search_regex(
+ r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
+ webpage, 'average rating', fatal=False))
+
+ view_count = str_to_int(self._search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>',
+ webpage, 'view count', fatal=False))
+ comment_count = str_to_int(self._search_regex(
+ r'>All [Cc]omments? \(([\d,.]+)\)',
+ webpage, 'comment count', fatal=False))
+
+ def extract_tag_box(title):
+ tag_box = self._search_regex(
+ (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*'
+ '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title),
+ webpage, '%s tag box' % title, default=None)
+ if not tag_box:
+ return []
+ return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box)
+
+ categories = extract_tag_box('Category')
+ tags = extract_tag_box('Tags')
return {
'id': video_id,
- 'uploader': video_uploader,
- 'upload_date': upload_date,
- 'title': video_title,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
'thumbnail': thumbnail,
- 'description': video_description,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'average_rating': average_rating,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'categories': categories,
+ 'tags': tags,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 97ce36550..687e0b4db 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return
+class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
+ # Extract the video ids from the playlist pages
+ def _entries(self, page, playlist_id):
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ for video_id, video_title in self.extract_videos_from_page(content_html):
+ yield self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
+
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+ for mobj in re.finditer(self._VIDEO_RE, page):
+ # The link with index 0 is not the first video of the playlist (not sure if this is still the case)
+ if 'index' in mobj.groupdict() and mobj.group('index') == '0':
+ continue
+ video_id = mobj.group('id')
+ video_title = unescapeHTML(mobj.group('title'))
+ if video_title:
+ video_title = video_title.strip()
+ try:
+ idx = ids_in_page.index(video_id)
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ return zip(ids_in_page, titles_in_page)
+
+
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
@@ -657,7 +703,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1061,6 +1107,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not video_info:
video_info = get_video_info
if 'token' in get_video_info:
+ # Different get_video_info requests may report different results, e.g.
+ # some may report video unavailability, but some may serve it without
+ # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
+ # the original webpage as well as el=info and el=embedded get_video_info
+ # requests report video unavailability due to geo restriction while
+ # el=detailpage succeeds and returns valid data). This is probably
+ # due to YouTube measures against IP ranges of hosting providers.
+ # Work around this by preferring the first successful video_info containing
+ # the token if no such video_info has been found yet.
+ if 'token' not in video_info:
+ video_info = get_video_info
break
if 'token' not in video_info:
if 'reason' in video_info:
@@ -1286,7 +1343,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1419,7 +1476,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
(?:https?://)?
@@ -1440,7 +1497,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -1557,37 +1614,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
else:
self.report_warning('Youtube gives an alert message: ' + match)
- # Extract the video ids from the playlist pages
- def _entries():
- more_widget_html = content_html = page
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- for vid_id in new_ids:
- yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
-
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, 'title')
- return self.playlist_result(_entries(), playlist_id, playlist_title)
+ return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
def _real_extract(self, url):
# Extract playlist id
@@ -1613,36 +1644,31 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self._extract_playlist(playlist_id)
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels'
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+ _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel'
_TESTS = [{
'note': 'paginated channel',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'playlist_mincount': 91,
'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'Uploads from lex will',
}
+ }, {
+ 'note': 'Age restricted channel',
+ # from https://www.youtube.com/user/DeusExOfficial
+ 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
+ 'playlist_mincount': 64,
+ 'info_dict': {
+ 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
+ 'title': 'Uploads from Deus Ex',
+ },
}]
- @staticmethod
- def extract_videos_from_page(page):
- ids_in_page = []
- titles_in_page = []
- for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
- video_id = mobj.group('id')
- video_title = unescapeHTML(mobj.group('title'))
- try:
- idx = ids_in_page.index(video_id)
- if video_title and not titles_in_page[idx]:
- titles_in_page[idx] = video_title
- except ValueError:
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
- return zip(ids_in_page, titles_in_page)
-
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -1654,12 +1680,15 @@ class YoutubeChannelIE(InfoExtractor):
channel_page = self._download_webpage(
url + '?view=57', channel_id,
'Downloading channel page', fatal=False)
- channel_playlist_id = self._html_search_meta(
- 'channelId', channel_page, 'channel id', default=None)
- if not channel_playlist_id:
- channel_playlist_id = self._search_regex(
- r'data-channel-external-id="([^"]+)"',
- channel_page, 'channel id', default=None)
+ if channel_page is False:
+ channel_playlist_id = False
+ else:
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_playlist_id = self._search_regex(
+ r'data-(?:channel-external-|yt)id="([^"]+)"',
+ channel_page, 'channel id', default=None)
if channel_playlist_id and channel_playlist_id.startswith('UC'):
playlist_id = 'UU' + channel_playlist_id[2:]
return self.url_result(
@@ -1682,29 +1711,7 @@ class YoutubeChannelIE(InfoExtractor):
for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
- def _entries():
- more_widget_html = content_html = channel_page
- for pagenum in itertools.count(1):
-
- for video_id, video_title in self.extract_videos_from_page(content_html):
- yield self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
-
- mobj = re.search(
- r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
- more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), channel_id,
- 'Downloading page #%s' % (pagenum + 1),
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
-
- return self.playlist_result(_entries(), channel_id)
+ return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
class YoutubeUserIE(YoutubeChannelIE):
@@ -1970,6 +1977,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
annotation_id=annotation_[^&]+|
x-yt-cl=[0-9]+|
hl=[^&]*|
+ t=[0-9]+
)?
|
attribution_link\?a=[^&]+
@@ -1992,6 +2000,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):
}, {
'url': 'https://www.youtube.com/watch?hl=en-GB',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?t=2372',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 98f15177b..a795f56b3 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -9,6 +9,7 @@ from ..utils import (
int_or_none,
unified_strdate,
OnDemandPagedList,
+ xpath_text,
)
@@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url):
errnote='Failed to download video info')
title = doc.find('.//information/title').text
- description = doc.find('.//information/detail').text
- duration = int(doc.find('.//details/lengthSec').text)
- uploader_node = doc.find('.//details/originChannelTitle')
- uploader = None if uploader_node is None else uploader_node.text
- uploader_id_node = doc.find('.//details/originChannelId')
- uploader_id = None if uploader_id_node is None else uploader_id_node.text
- upload_date = unified_strdate(doc.find('.//details/airtime').text)
+ description = xpath_text(doc, './/information/detail', 'description')
+ duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration'))
+ uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader')
+ uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id')
+ upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date'))
def xml_to_format(fnode):
video_url = fnode.find('url').text
@@ -40,15 +39,14 @@ def extract_from_xml_url(ie, video_id, xml_url):
ext = format_m.group('container')
proto = format_m.group('proto').lower()
- quality = fnode.find('./quality').text
- abr = int(fnode.find('./audioBitrate').text) // 1000
- vbr_node = fnode.find('./videoBitrate')
- vbr = None if vbr_node is None else int(vbr_node.text) // 1000
+ quality = xpath_text(fnode, './quality', 'quality')
+ abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000)
+ vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000)
- width_node = fnode.find('./width')
- width = None if width_node is None else int_or_none(width_node.text)
- height_node = fnode.find('./height')
- height = None if height_node is None else int_or_none(height_node.text)
+ width = int_or_none(xpath_text(fnode, './width', 'width'))
+ height = int_or_none(xpath_text(fnode, './height', 'height'))
+
+ filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize'))
format_note = ''
if not format_note:
@@ -64,12 +62,31 @@ def extract_from_xml_url(ie, video_id, xml_url):
'vbr': vbr,
'width': width,
'height': height,
- 'filesize': int_or_none(fnode.find('./filesize').text),
+ 'filesize': filesize,
'format_note': format_note,
'protocol': proto,
'_available': is_available,
}
+ def xml_to_thumbnails(fnode):
+ thumbnails = []
+ for node in fnode:
+ thumbnail_url = node.text
+ if not thumbnail_url:
+ continue
+ thumbnail = {
+ 'url': thumbnail_url,
+ }
+ if 'key' in node.attrib:
+ m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key'])
+ if m:
+ thumbnail['width'] = int(m.group(1))
+ thumbnail['height'] = int(m.group(2))
+ thumbnails.append(thumbnail)
+ return thumbnails
+
+ thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage'))
+
format_nodes = doc.findall('.//formitaeten/formitaet')
formats = list(filter(
lambda f: f['_available'],
@@ -81,6 +98,7 @@ def extract_from_xml_url(ie, video_id, xml_url):
'title': title,
'description': description,
'duration': duration,
+ 'thumbnails': thumbnails,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
index 7dc1e2f2b..437eecb67 100644
--- a/youtube_dl/extractor/zingmp3.py
+++ b/youtube_dl/extractor/zingmp3.py
@@ -9,9 +9,11 @@ from ..utils import ExtractorError
class ZingMp3BaseInfoExtractor(InfoExtractor):
- def _extract_item(self, item):
+ def _extract_item(self, item, fatal=True):
error_message = item.find('./errormessage').text
if error_message:
+ if not fatal:
+ return
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, error_message),
expected=True)
@@ -43,7 +45,9 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
entries = []
for i, item in enumerate(items, 1):
- entry = self._extract_item(item)
+ entry = self._extract_item(item, fatal=False)
+ if not entry:
+ continue
entry['id'] = '%s-%d' % (id, i)
entries.append(entry)
@@ -85,7 +89,7 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
- _VALID_URL = r'https?://mp3\.zing\.vn/album/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
+ _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
_TESTS = [{
'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
'info_dict': {
@@ -94,6 +98,9 @@ class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless',
},
'playlist_count': 10,
+ }, {
+ 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
+ 'only_matching': True,
}]
IE_NAME = 'zingmp3:album'
IE_DESC = 'mp3.zing.vn albums'
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 0e0c7d90d..9bc855144 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -232,10 +232,10 @@ class JSInterpreter(object):
def extract_function(self, funcname):
func_m = re.search(
r'''(?x)
- (?:function\s+%s|[{;]%s\s*=\s*function)\s*
+ (?:function\s+%s|[{;]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
\((?P<args>[^)]*)\)\s*
\{(?P<code>[^}]+)\}''' % (
- re.escape(funcname), re.escape(funcname)),
+ re.escape(funcname), re.escape(funcname), re.escape(funcname)),
self.code)
if func_m is None:
raise ExtractorError('Could not find JS function %r' % funcname)
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 5eccc0a70..3dd6d290b 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -276,7 +276,7 @@ def parseOpts(overrideArguments=None):
'For example, to only match videos that have been liked more than '
'100 times and disliked less than 50 times (or the dislike '
'functionality is not available at the given service), but who '
- 'also have a description, use --match-filter '
+ 'also have a description, use --match-filter '
'"like_count > 100 & dislike_count <? 50 & description" .'
))
selection.add_option(
@@ -602,7 +602,7 @@ def parseOpts(overrideArguments=None):
filesystem.add_option(
'-A', '--auto-number',
action='store_true', dest='autonumber', default=False,
- help='[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] Number downloaded files starting from 00000')
+ help='[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] Number downloaded files starting from 00000')
filesystem.add_option(
'-t', '--title',
action='store_true', dest='usetitle', default=False,
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 1f723908b..5ed723bc6 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -135,7 +135,10 @@ class FFmpegPostProcessor(PostProcessor):
files_cmd = []
for path in input_paths:
- files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
+ files_cmd.extend([
+ encodeArgument('-i'),
+ encodeFilename(self._ffmpeg_filename_argument(path), True)
+ ])
cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] +
files_cmd +
[encodeArgument(o) for o in opts] +
@@ -155,10 +158,10 @@ class FFmpegPostProcessor(PostProcessor):
self.run_ffmpeg_multiple_files([path], out_path, opts)
def _ffmpeg_filename_argument(self, fn):
- # ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details
- if fn.startswith('-'):
- return './' + fn
- return fn
+ # Always use 'file:' because the filename may contain ':' (ffmpeg
+ # interprets that as a protocol) or can start with '-' (-- is broken in
+ # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details)
+ return 'file:' + fn
class FFmpegExtractAudioPP(FFmpegPostProcessor):
@@ -269,7 +272,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
return [], information
try:
- self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
+ self._downloader.to_screen('[ffmpeg] Destination: ' + new_path)
self.run_ffmpeg(path, new_path, acodec, more_opts)
except AudioConversionError as e:
raise PostProcessingError(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 518cea98b..65556d056 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3,6 +3,7 @@
from __future__ import unicode_literals
+import base64
import calendar
import codecs
import contextlib
@@ -35,6 +36,7 @@ import zlib
from .compat import (
compat_basestring,
compat_chr,
+ compat_etree_fromstring,
compat_html_entities,
compat_http_client,
compat_kwargs,
@@ -177,10 +179,19 @@ def xpath_with_ns(path, ns_map):
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
- if sys.version_info < (2, 7): # Crazy 2.6
- xpath = xpath.encode('ascii')
+ def _find_xpath(xpath):
+ if sys.version_info < (2, 7): # Crazy 2.6
+ xpath = xpath.encode('ascii')
+ return node.find(xpath)
+
+ if isinstance(xpath, (str, compat_str)):
+ n = _find_xpath(xpath)
+ else:
+ for xp in xpath:
+ n = _find_xpath(xp)
+ if n is not None:
+ break
- n = node.find(xpath)
if n is None:
if default is not NO_DEFAULT:
return default
@@ -364,7 +375,7 @@ def sanitize_path(s):
if drive_or_unc:
norm_path.pop(0)
sanitized_path = [
- path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
+ path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
for path_part in norm_path]
if drive_or_unc:
sanitized_path.insert(0, drive_or_unc + os.path.sep)
@@ -628,7 +639,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
# expected HTTP responses to meet HTTP/1.0 or later (see also
# https://github.com/rg3/youtube-dl/issues/6727)
if sys.version_info < (3, 0):
- kwargs['strict'] = True
+ kwargs[b'strict'] = True
hc = http_class(*args, **kwargs)
source_address = ydl_handler._params.get('source_address')
if source_address is not None:
@@ -822,9 +833,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
if date_str is None:
return None
+ date_str = re.sub(r'\.[0-9]+', '', date_str)
+
if timezone is None:
m = re.search(
- r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+ r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
date_str)
if not m:
timezone = datetime.timedelta()
@@ -837,9 +850,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
- date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
- dt = datetime.datetime.strptime(date_str, date_format) - timezone
- return calendar.timegm(dt.timetuple())
+ try:
+ date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+ dt = datetime.datetime.strptime(date_str, date_format) - timezone
+ return calendar.timegm(dt.timetuple())
+ except ValueError:
+ pass
def unified_strdate(date_str, day_first=True):
@@ -904,7 +920,8 @@ def unified_strdate(date_str, day_first=True):
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
- return upload_date
+ if upload_date is not None:
+ return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
@@ -1380,7 +1397,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
v = getattr(v, get_attr, None)
if v == '':
v = None
- return default if v is None else (int(v) * invscale // scale)
+ if v is None:
+ return default
+ try:
+ return int(v) * invscale // scale
+ except ValueError:
+ return default
def str_or_none(v, default=None):
@@ -1396,7 +1418,12 @@ def str_to_int(int_str):
def float_or_none(v, scale=1, invscale=1, default=None):
- return default if v is None else (float(v) * invscale / scale)
+ if v is None:
+ return default
+ try:
+ return float(v) * invscale / scale
+ except ValueError:
+ return default
def parse_duration(s):
@@ -1649,29 +1676,6 @@ def encode_dict(d, encoding='utf-8'):
return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
-try:
- etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError: # Python <=2.6
- etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
- class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
- def doctype(self, name, pubid, system):
- pass # Ignore doctypes
-
- parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
- kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
- tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
- # Fix up XML parser in Python 2.x
- if sys.version_info < (3, 0):
- for n in etree_iter(tree):
- if n.text is not None:
- if not isinstance(n.text, compat_str):
- n.text = n.text.decode('utf-8')
- return tree
-
-
US_RATINGS = {
'G': 0,
'PG': 10,
@@ -1699,8 +1703,8 @@ def js_to_json(code):
if v in ('true', 'false', 'null'):
return v
if v.startswith('"'):
- return v
- if v.startswith("'"):
+ v = re.sub(r"\\'", "'", v[1:-1])
+ elif v.startswith("'"):
v = v[1:-1]
v = re.sub(r"\\\\|\\'|\"", lambda m: {
'\\\\': '\\\\',
@@ -1794,6 +1798,10 @@ def urlhandle_detect_ext(url_handle):
return mimetype2ext(getheader('Content-Type'))
+def encode_data_uri(data, mime_type):
+ return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
def age_restricted(content_limit, age_limit):
""" Returns True iff the content should be blocked """
@@ -1968,7 +1976,7 @@ def dfxp2srt(dfxp_data):
return out
- dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+ dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 0cc7411f2..6585d60d5 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.09.09'
+__version__ = '2015.11.13'