aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py76
-rw-r--r--youtube_dl/__init__.py1
-rw-r--r--youtube_dl/compat.py8
-rw-r--r--youtube_dl/downloader/common.py3
-rw-r--r--youtube_dl/downloader/fragment.py3
-rw-r--r--youtube_dl/extractor/__init__.py42
-rw-r--r--youtube_dl/extractor/aol.py70
-rw-r--r--youtube_dl/extractor/appletrailers.py5
-rw-r--r--youtube_dl/extractor/arte.py54
-rw-r--r--youtube_dl/extractor/audimedia.py19
-rw-r--r--youtube_dl/extractor/audioboom.py66
-rw-r--r--youtube_dl/extractor/bbc.py30
-rw-r--r--youtube_dl/extractor/bleacherreport.py10
-rw-r--r--youtube_dl/extractor/bokecc.py60
-rw-r--r--youtube_dl/extractor/c56.py22
-rw-r--r--youtube_dl/extractor/cinemassacre.py24
-rw-r--r--youtube_dl/extractor/cnet.py6
-rw-r--r--youtube_dl/extractor/common.py139
-rw-r--r--youtube_dl/extractor/douyutv.py23
-rw-r--r--youtube_dl/extractor/dplay.py123
-rw-r--r--youtube_dl/extractor/dw.py85
-rw-r--r--youtube_dl/extractor/elpais.py31
-rw-r--r--youtube_dl/extractor/engadget.py25
-rw-r--r--youtube_dl/extractor/facebook.py109
-rw-r--r--youtube_dl/extractor/faz.py2
-rw-r--r--youtube_dl/extractor/fivemin.py51
-rw-r--r--youtube_dl/extractor/foxnews.py4
-rw-r--r--youtube_dl/extractor/freespeech.py2
-rw-r--r--youtube_dl/extractor/generic.py50
-rw-r--r--youtube_dl/extractor/googledrive.py14
-rw-r--r--youtube_dl/extractor/imdb.py2
-rw-r--r--youtube_dl/extractor/indavideo.py2
-rw-r--r--youtube_dl/extractor/infoq.py31
-rw-r--r--youtube_dl/extractor/iqiyi.py262
-rw-r--r--youtube_dl/extractor/jeuxvideo.py2
-rw-r--r--youtube_dl/extractor/jwplatform.py63
-rw-r--r--youtube_dl/extractor/kaltura.py67
-rw-r--r--youtube_dl/extractor/khanacademy.py4
-rw-r--r--youtube_dl/extractor/kusi.py99
-rw-r--r--youtube_dl/extractor/kuwo.py1
-rw-r--r--youtube_dl/extractor/leeco.py (renamed from youtube_dl/extractor/letv.py)111
-rw-r--r--youtube_dl/extractor/lifenews.py101
-rw-r--r--youtube_dl/extractor/livestream.py8
-rw-r--r--youtube_dl/extractor/makerschannel.py40
-rw-r--r--youtube_dl/extractor/mdr.py9
-rw-r--r--youtube_dl/extractor/minoto.py56
-rw-r--r--youtube_dl/extractor/mit.py2
-rw-r--r--youtube_dl/extractor/mixcloud.py5
-rw-r--r--youtube_dl/extractor/motherless.py94
-rw-r--r--youtube_dl/extractor/mtv.py5
-rw-r--r--youtube_dl/extractor/nba.py101
-rw-r--r--youtube_dl/extractor/nrk.py38
-rw-r--r--youtube_dl/extractor/pbs.py72
-rw-r--r--youtube_dl/extractor/pyvideo.py6
-rw-r--r--youtube_dl/extractor/revision3.py97
-rw-r--r--youtube_dl/extractor/rtve.py30
-rw-r--r--youtube_dl/extractor/safari.py64
-rw-r--r--youtube_dl/extractor/screenwavemedia.py18
-rw-r--r--youtube_dl/extractor/sexu.py25
-rw-r--r--youtube_dl/extractor/space.py38
-rw-r--r--youtube_dl/extractor/ted.py2
-rw-r--r--youtube_dl/extractor/tf1.py6
-rw-r--r--youtube_dl/extractor/theplatform.py18
-rw-r--r--youtube_dl/extractor/tnaflix.py34
-rw-r--r--youtube_dl/extractor/twitch.py26
-rw-r--r--youtube_dl/extractor/twitter.py259
-rw-r--r--youtube_dl/extractor/ustudio.py67
-rw-r--r--youtube_dl/extractor/vgtv.py17
-rw-r--r--youtube_dl/extractor/vice.py78
-rw-r--r--youtube_dl/extractor/videomega.py10
-rw-r--r--youtube_dl/extractor/vidzi.py27
-rw-r--r--youtube_dl/extractor/viki.py6
-rw-r--r--youtube_dl/extractor/vimeo.py82
-rw-r--r--youtube_dl/extractor/vk.py35
-rw-r--r--youtube_dl/extractor/wat.py10
-rw-r--r--youtube_dl/extractor/webofstories.py64
-rw-r--r--youtube_dl/extractor/wimp.py2
-rw-r--r--youtube_dl/extractor/wistia.py3
-rw-r--r--youtube_dl/extractor/xfileshare.py10
-rw-r--r--youtube_dl/extractor/yandexmusic.py18
-rw-r--r--youtube_dl/extractor/youporn.py5
-rw-r--r--youtube_dl/extractor/youtube.py215
-rw-r--r--youtube_dl/extractor/zdf.py4
-rw-r--r--youtube_dl/options.py8
-rw-r--r--youtube_dl/postprocessor/__init__.py2
-rw-r--r--youtube_dl/postprocessor/embedthumbnail.py2
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py19
-rw-r--r--youtube_dl/postprocessor/xattrpp.py5
-rw-r--r--youtube_dl/utils.py164
-rw-r--r--youtube_dl/version.py2
90 files changed, 2887 insertions, 923 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index f4324039c..8c651cd52 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -24,9 +24,6 @@ import time
import tokenize
import traceback
-if os.name == 'nt':
- import ctypes
-
from .compat import (
compat_basestring,
compat_cookiejar,
@@ -34,6 +31,7 @@ from .compat import (
compat_get_terminal_size,
compat_http_client,
compat_kwargs,
+ compat_os_name,
compat_str,
compat_tokenize_tokenize,
compat_urllib_error,
@@ -87,6 +85,7 @@ from .extractor import get_info_extractor, gen_extractors
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
+ FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
FFmpegFixupStretchedPP,
FFmpegMergerPP,
@@ -95,6 +94,9 @@ from .postprocessor import (
)
from .version import __version__
+if compat_os_name == 'nt':
+ import ctypes
+
class YoutubeDL(object):
"""YoutubeDL class.
@@ -450,7 +452,7 @@ class YoutubeDL(object):
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
return
- if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+ if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
# c_wchar_p() might not be necessary if `message` is
# already of type unicode()
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
@@ -521,7 +523,7 @@ class YoutubeDL(object):
else:
if self.params.get('no_warnings'):
return
- if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+ if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
else:
_msg_header = 'WARNING:'
@@ -533,7 +535,7 @@ class YoutubeDL(object):
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
'''
- if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+ if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
_msg_header = '\033[0;31mERROR:\033[0m'
else:
_msg_header = 'ERROR:'
@@ -566,7 +568,7 @@ class YoutubeDL(object):
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
- template_dict['resolution'] = '?x%d' % template_dict['width']
+ template_dict['resolution'] = '%dx?' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
@@ -1232,6 +1234,10 @@ class YoutubeDL(object):
if t.get('id') is None:
t['id'] = '%d' % i
+ if self.params.get('list_thumbnails'):
+ self.list_thumbnails(info_dict)
+ return
+
if thumbnails and 'thumbnail' not in info_dict:
info_dict['thumbnail'] = thumbnails[-1]['url']
@@ -1333,9 +1339,6 @@ class YoutubeDL(object):
if self.params.get('listformats'):
self.list_formats(info_dict)
return
- if self.params.get('list_thumbnails'):
- self.list_thumbnails(info_dict)
- return
req_format = self.params.get('format')
if req_format is None:
@@ -1631,12 +1634,14 @@ class YoutubeDL(object):
self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
return
- if success:
+ if success and filename != '-':
# Fixup content
fixup_policy = self.params.get('fixup')
if fixup_policy is None:
fixup_policy = 'detect_or_warn'
+ INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
+
stretched_ratio = info_dict.get('stretched_ratio')
if stretched_ratio is not None and stretched_ratio != 1:
if fixup_policy == 'warn':
@@ -1649,15 +1654,18 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(stretched_pp)
else:
self.report_warning(
- '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
- info_dict['id'], stretched_ratio))
+ '%s: Non-uniform pixel ratio (%s). %s'
+ % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
- if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
+ if (info_dict.get('requested_formats') is None and
+ info_dict.get('container') == 'm4a_dash'):
if fixup_policy == 'warn':
- self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
- info_dict['id']))
+ self.report_warning(
+ '%s: writing DASH m4a. '
+ 'Only some players support this container.'
+ % info_dict['id'])
elif fixup_policy == 'detect_or_warn':
fixup_pp = FFmpegFixupM4aPP(self)
if fixup_pp.available:
@@ -1665,8 +1673,27 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(fixup_pp)
else:
self.report_warning(
- '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
- info_dict['id']))
+ '%s: writing DASH m4a. '
+ 'Only some players support this container. %s'
+ % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
+ else:
+ assert fixup_policy in ('ignore', 'never')
+
+ if (info_dict.get('protocol') == 'm3u8_native' or
+ info_dict.get('protocol') == 'm3u8' and
+ self.params.get('hls_prefer_native')):
+ if fixup_policy == 'warn':
+ self.report_warning('%s: malformated aac bitstream.' % (
+ info_dict['id']))
+ elif fixup_policy == 'detect_or_warn':
+ fixup_pp = FFmpegFixupM3u8PP(self)
+ if fixup_pp.available:
+ info_dict.setdefault('__postprocessors', [])
+ info_dict['__postprocessors'].append(fixup_pp)
+ else:
+ self.report_warning(
+ '%s: malformated aac bitstream. %s'
+ % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
@@ -1830,7 +1857,9 @@ class YoutubeDL(object):
if fdict.get('vbr') is not None:
res += '%4dk' % fdict['vbr']
if fdict.get('fps') is not None:
- res += ', %sfps' % fdict['fps']
+ if res:
+ res += ', '
+ res += '%sfps' % fdict['fps']
if fdict.get('acodec') is not None:
if res:
res += ', '
@@ -1873,13 +1902,8 @@ class YoutubeDL(object):
def list_thumbnails(self, info_dict):
thumbnails = info_dict.get('thumbnails')
if not thumbnails:
- tn_url = info_dict.get('thumbnail')
- if tn_url:
- thumbnails = [{'id': '0', 'url': tn_url}]
- else:
- self.to_screen(
- '[info] No thumbnails present for %s' % info_dict['id'])
- return
+ self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
+ return
self.to_screen(
'[info] Thumbnails for %s:' % info_dict['id'])
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f5f064241..79b389840 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -355,6 +355,7 @@ def _real_main(argv=None):
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'encoding': opts.encoding,
'extract_flat': opts.extract_flat,
+ 'mark_watched': opts.mark_watched,
'merge_output_format': opts.merge_output_format,
'postprocessors': postprocessors,
'fixup': opts.fixup,
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index b497da696..2771fb5fa 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -326,6 +326,9 @@ def compat_ord(c):
return ord(c)
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
if sys.version_info >= (3, 0):
compat_getenv = os.getenv
compat_expanduser = os.path.expanduser
@@ -346,7 +349,7 @@ else:
# The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
# for different platforms with correct environment variables decoding.
- if os.name == 'posix':
+ if compat_os_name == 'posix':
def compat_expanduser(path):
"""Expand ~ and ~user constructions. If user or $HOME is unknown,
do nothing."""
@@ -370,7 +373,7 @@ else:
userhome = pwent.pw_dir
userhome = userhome.rstrip('/')
return (userhome + path[i:]) or '/'
- elif os.name == 'nt' or os.name == 'ce':
+ elif compat_os_name == 'nt' or compat_os_name == 'ce':
def compat_expanduser(path):
"""Expand ~ and ~user constructs.
@@ -556,6 +559,7 @@ __all__ = [
'compat_itertools_count',
'compat_kwargs',
'compat_ord',
+ 'compat_os_name',
'compat_parse_qs',
'compat_print',
'compat_shlex_split',
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 2d5154051..f39db58f6 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -5,6 +5,7 @@ import re
import sys
import time
+from ..compat import compat_os_name
from ..utils import (
encodeFilename,
error_to_compat_str,
@@ -219,7 +220,7 @@ class FileDownloader(object):
if self.params.get('progress_with_newline', False):
self.to_screen(fullmsg)
else:
- if os.name == 'nt':
+ if compat_os_name == 'nt':
prev_len = getattr(self, '_report_progress_prev_line_length',
0)
if prev_len > len(fullmsg):
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 5bc99492b..a5bae9669 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -99,7 +99,8 @@ class FragmentFD(FileDownloader):
state['eta'] = self.calc_eta(
start, time_now, estimated_size,
state['downloaded_bytes'])
- state['speed'] = s.get('speed')
+ state['speed'] = s.get('speed') or ctx.get('speed')
+ ctx['speed'] = state['speed']
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
self._hook_progress(state)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 1edbfbd28..c5b80f4aa 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE
from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
from .anysex import AnySexIE
-from .aol import AolIE
+from .aol import (
+ AolIE,
+ AolFeaturesIE,
+)
from .allocine import AllocineIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
@@ -51,6 +54,7 @@ from .arte import (
from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
from .azubu import AzubuIE, AzubuLiveIE
from .baidu import BaiduVideoIE
@@ -74,6 +78,7 @@ from .bleacherreport import (
)
from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
+from .bokecc import BokeCCIE
from .bpb import BpbIE
from .br import BRIE
from .breakcom import BreakIE
@@ -184,6 +189,10 @@ from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .dropbox import DropboxIE
+from .dw import (
+ DWIE,
+ DWArticleIE,
+)
from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
@@ -208,10 +217,7 @@ from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
from .extremetube import ExtremeTubeIE
-from .facebook import (
- FacebookIE,
- FacebookPostIE,
-)
+from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
from .fczenit import FczenitIE
@@ -339,6 +345,7 @@ from .konserthusetplay import KonserthusetPlayIE
from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
+from .kusi import KUSIIE
from .kuwo import (
KuwoIE,
KuwoAlbumIE,
@@ -351,10 +358,9 @@ from .la7 import LA7IE
from .laola1tv import Laola1TvIE
from .lecture2go import Lecture2GoIE
from .lemonde import LemondeIE
-from .letv import (
- LetvIE,
- LetvTvIE,
- LetvPlaylistIE,
+from .leeco import (
+ LeIE,
+ LePlaylistIE,
LetvCloudIE,
)
from .libsyn import LibsynIE
@@ -383,6 +389,7 @@ from .lynda import (
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
from .mailru import MailRuIE
+from .makerschannel import MakersChannelIE
from .makertv import MakerTVIE
from .malemotion import MalemotionIE
from .matchtv import MatchTVIE
@@ -392,6 +399,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
from .miomio import MioMioIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
@@ -505,6 +513,7 @@ from .npr import NprIE
from .nrk import (
NRKIE,
NRKPlaylistIE,
+ NRKSkoleIE,
NRKTVIE,
)
from .ntvde import NTVDeIE
@@ -669,7 +678,6 @@ from .southpark import (
SouthParkEsIE,
SouthParkNlIE
)
-from .space import SpaceIE
from .spankbang import SpankBangIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
@@ -737,6 +745,7 @@ from .tmz import (
TMZArticleIE,
)
from .tnaflix import (
+ TNAFlixNetworkEmbedIE,
TNAFlixIE,
EMPFlixIE,
MovieFapIE,
@@ -798,7 +807,11 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
-from .twitter import TwitterCardIE, TwitterIE
+from .twitter import (
+ TwitterCardIE,
+ TwitterIE,
+ TwitterAmplifyIE,
+)
from .ubu import UbuIE
from .udemy import (
UdemyIE,
@@ -809,6 +822,7 @@ from .digiteka import DigitekaIE
from .unistra import UnistraIE
from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
+from .ustudio import UstudioIE
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
@@ -822,7 +836,10 @@ from .vgtv import (
VGTVIE,
)
from .vh1 import VH1IE
-from .vice import ViceIE
+from .vice import (
+ ViceIE,
+ ViceShowIE,
+)
from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
@@ -849,6 +866,7 @@ from .vimeo import (
VimeoChannelIE,
VimeoGroupsIE,
VimeoLikesIE,
+ VimeoOndemandIE,
VimeoReviewIE,
VimeoUserIE,
VimeoWatchLaterIE,
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index b51eafc45..b761b2cc4 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -1,24 +1,11 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class AolIE(InfoExtractor):
IE_NAME = 'on.aol.com'
- _VALID_URL = r'''(?x)
- (?:
- aol-video:|
- http://on\.aol\.com/
- (?:
- video/.*-|
- playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
- )
- )
- (?P<id>[0-9]+)
- (?:$|\?)
- '''
+ _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)'
_TESTS = [{
'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
@@ -29,42 +16,31 @@ class AolIE(InfoExtractor):
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
},
'add_ie': ['FiveMin'],
- }, {
- 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316',
- 'info_dict': {
- 'id': '152147',
- 'title': 'Brace Yourself - Today\'s Weirdest News',
- },
- 'playlist_mincount': 10,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- playlist_id = mobj.group('playlist_id')
- if not playlist_id or self._downloader.params.get('noplaylist'):
- return self.url_result('5min:%s' % video_id)
+ video_id = self._match_id(url)
+ return self.url_result('5min:%s' % video_id)
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(
- r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
- playlist_html = self._search_regex(
- r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
- 'playlist HTML')
- entries = [{
- '_type': 'url',
- 'url': 'aol-video:%s' % m.group('id'),
- 'ie_key': 'Aol',
- } for m in re.finditer(
- r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
- playlist_html)]
+class AolFeaturesIE(InfoExtractor):
+ IE_NAME = 'features.aol.com'
+ _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)'
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'display_id': mobj.group('playlist_display_id'),
- 'title': title,
- 'entries': entries,
- }
+ _TESTS = [{
+ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts',
+ 'md5': '7db483bb0c09c85e241f84a34238cc75',
+ 'info_dict': {
+ 'id': '519507715',
+ 'ext': 'mp4',
+ 'title': 'What To Watch - February 17, 2016',
+ },
+ 'add_ie': ['FiveMin'],
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ return self.url_result(self._search_regex(
+ r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"',
+ webpage, '5min embed url'), 'FiveMin')
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 62ed0c918..be40f85b4 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -12,7 +12,7 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
IE_NAME = 'appletrailers'
- _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TESTS = [{
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
@@ -73,6 +73,9 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
'only_matching': True,
+ }, {
+ 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
+ 'only_matching': True,
}]
_JSON_RE = r'iTunes.playURL\((.*?)\);'
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 793da2ee1..3e119e21b 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor):
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
@classmethod
def _extract_url_info(cls, url):
@@ -110,17 +110,29 @@ class ArteTVPlus7IE(InfoExtractor):
# en and es URLs produce react-based pages with different layout (e.g.
# http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
if not iframe_url:
- embed_html = self._parse_json(
- self._search_regex(
- r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
- webpage, 'program'),
- video_id)['embed_html']
- iframe_url = find_iframe_url(embed_html)
- json_url = compat_parse_qs(
- compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
- return self._extract_from_json_url(json_url, video_id, lang)
-
- def _extract_from_json_url(self, json_url, video_id, lang):
+ program = self._search_regex(
+ r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+ webpage, 'program', default=None)
+ if program:
+ embed_html = self._parse_json(program, video_id)
+ if embed_html:
+ iframe_url = find_iframe_url(embed_html['embed_html'])
+ if iframe_url:
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+ if json_url:
+ title = self._search_regex(
+ r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+ webpage, 'title', default=None, group='title')
+ return self._extract_from_json_url(json_url, video_id, lang, title=title)
+ # Different kind of embed URL (e.g.
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
+ webpage, 'embed url', group='url')
+ return self.url_result(embed_url)
+
+ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
@@ -128,7 +140,7 @@ class ArteTVPlus7IE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
- title = player_info['VTI'].strip()
+ title = (player_info.get('VTI') or title or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
@@ -230,6 +242,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
'id': '050940-028-A',
'ext': 'mp4',
'title': 'Les écrevisses aussi peuvent être anxieuses',
+ 'upload_date': '20140902',
},
}, {
'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable',
@@ -294,12 +307,25 @@ class ArteTVMagazineIE(ArteTVPlus7IE):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
_TESTS = [{
+ # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..."
'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium',
- 'md5': '66a093339c1278bb3719157ef07107b2',
+ 'md5': '2a9369bcccf847d1c741e51416299f25',
'info_dict': {
'id': '065965-000-A',
'ext': 'mp4',
'title': 'Trepalium - Extrait Ep.01',
+ 'upload_date': '20160121',
+ },
+ }, {
+ # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium"
+ 'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium',
+ 'md5': 'fedc64fc7a946110fe311634e79782ca',
+ 'info_dict': {
+ 'id': '054813-004_PLUS7-F',
+ 'ext': 'mp4',
+ 'title': 'Trepalium (4/6)',
+ 'description': 'md5:10057003c34d54e95350be4f9b05cb40',
+ 'upload_date': '20160218',
},
}, {
'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis',
diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py
index 3b2effa15..aa6925623 100644
--- a/youtube_dl/extractor/audimedia.py
+++ b/youtube_dl/extractor/audimedia.py
@@ -10,9 +10,9 @@ from ..utils import (
class AudiMediaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'
_TEST = {
- 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',
+ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
'md5': '79a8b71c46d49042609795ab59779b66',
'info_dict': {
'id': '1565',
@@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload')
+ raw_payload = self._search_regex([
+ r'class="amtv-embed"[^>]+id="([^"]+)"',
+ r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"',
+ ], webpage, 'raw payload')
_, stage_mode, video_id, lang = raw_payload.split('-')
# TODO: handle s and e stage_mode (live streams and ended live streams)
@@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):
video_version_url = video_version.get('download_url') or video_version.get('stream_url')
if not video_version_url:
continue
- formats.append({
+ f = {
'url': video_version_url,
'width': int_or_none(video_version.get('width')),
'height': int_or_none(video_version.get('height')),
'abr': int_or_none(video_version.get('audio_bitrate')),
'vbr': int_or_none(video_version.get('video_bitrate')),
- })
+ }
+ bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
+ if bitrate:
+ f.update({
+ 'format_id': 'http-%s' % bitrate,
+ })
+ formats.append(f)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644
index 000000000..2ec2d7092
--- /dev/null
+++ b/youtube_dl/extractor/audioboom.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+ 'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+ 'info_dict': {
+ 'id': '4279833',
+ 'ext': 'mp3',
+ 'title': '3/09/2016 Czaban Hour 3',
+ 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
+ 'duration': 2245.72,
+ 'uploader': 'Steve Czaban',
+ 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ clip = None
+
+ clip_store = self._parse_json(
+ self._search_regex(
+ r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+ webpage, 'clip store', default='{}', group='json'),
+ video_id, fatal=False)
+ if clip_store:
+ clips = clip_store.get('clips')
+ if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+ clip = clips[0]
+
+        def from_clip(field):
+            if clip:  # no clip parsed -> falls through to None so the callers' `or` fallbacks apply
+                return clip.get(field)
+
+ audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+ 'audio', webpage, 'audio url')
+ title = from_clip('title') or self._og_search_title(webpage)
+ description = from_clip('description') or self._og_search_description(webpage)
+
+ duration = float_or_none(from_clip('duration') or self._html_search_meta(
+ 'weibo:audio:duration', webpage))
+
+ uploader = from_clip('author') or self._og_search_property(
+ 'audio:artist', webpage, 'uploader', fatal=False)
+ uploader_url = from_clip('author_url') or self._html_search_meta(
+ 'audioboo:channel', webpage, 'uploader url')
+
+ return {
+ 'id': video_id,
+ 'url': audio_url,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_url': uploader_url,
+ }
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 9d0dfb961..e62b3860e 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
- remove_end,
unescapeHTML,
)
from ..compat import (
@@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
- 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ 'title': 'BUGGER',
},
'playlist_count': 18,
}, {
@@ -670,10 +669,18 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
- 'title': 'What Liverpool can expect from Klopp',
+ 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
},
'playlist_count': 3,
}, {
+ # school report article with single video
+ 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+ 'info_dict': {
+ 'id': '35744779',
+ 'title': 'School which breaks down barriers in Jerusalem',
+ },
+ 'playlist_count': 1,
+ }, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'only_matching': True,
@@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp')
+
playlist_title = json_ld_info.get('title')
- playlist_description = json_ld_info.get('description')
+ if not playlist_title:
+ playlist_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ if playlist_title:
+ playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+ playlist_description = json_ld_info.get(
+ 'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
if entries:
- playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
- playlist_description = playlist_description or self._og_search_description(webpage, default=None)
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
- playlist_title = self._html_search_regex(
- r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage, default=None)
-
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 38bda3af5..7a8e1f60b 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):
'add_ie': ['Ooyala'],
}, {
'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
- 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50',
+ 'md5': '6a5cd403418c7b01719248ca97fb0692',
'info_dict': {
'id': '2586817',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
@@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):
'md5': '8c2c12e3af7805152675446c905d159b',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
new file mode 100644
index 000000000..122a1cbb6
--- /dev/null
+++ b/youtube_dl/extractor/bokecc.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import ExtractorError
+
+
+class BokeCCBaseIE(InfoExtractor):
+ def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
+ player_params_str = self._html_search_regex(
+ r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
+ webpage, 'player params')
+
+ player_params = compat_parse_qs(player_params_str)
+
+ info_xml = self._download_xml(
+ 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
+ player_params['siteid'][0], player_params['vid'][0]), video_id)
+
+ formats = [{
+ 'format_id': format_id,
+ 'url': quality.find('./copy').attrib['playurl'],
+ 'preference': int(quality.attrib['value']),
+ } for quality in info_xml.findall('./video/quality')]
+
+ self._sort_formats(formats)
+
+ return formats
+
+
+class BokeCCIE(BokeCCBaseIE):
+ _IE_DESC = 'CC视频'
+ _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
+
+ _TESTS = [{
+ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B',
+ 'info_dict': {
+ 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30',
+ 'ext': 'flv',
+ 'title': 'BokeCC Video',
+ },
+ }]
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ if not qs.get('vid') or not qs.get('uid'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
+
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': 'BokeCC Video', # no title provided in the webpage
+ 'formats': self._extract_bokecc_formats(webpage, video_id),
+ }
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index cb96c3876..cac8fdcba 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -4,12 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import js_to_json
class C56IE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
@@ -18,12 +19,29 @@ class C56IE(InfoExtractor):
'title': '网事知多少 第32期:车怒',
'duration': 283.813,
},
- }
+ }, {
+ 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '82247482',
+ 'title': '爱的诅咒之杜鹃花开',
+ },
+ 'playlist_count': 7,
+ 'add_ie': ['Sohu'],
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
+ webpage = self._download_webpage(url, text_id)
+ sohu_video_info_str = self._search_regex(
+ r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+ if sohu_video_info_str:
+ sohu_video_info = self._parse_json(
+ sohu_video_info_str, text_id, transform_source=js_to_json)
+ return self.url_result(sohu_video_info['url'], 'Sohu')
+
page = self._download_json(
'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index 6d9cd8abd..042c4f2f1 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor):
'title': '“Angry Video Game Nerd: The Movie” – Trailer',
'description': 'md5:fb87405fcb42a331742a0dce2708560b',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
@@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor):
'upload_date': '20131002',
'title': 'The Mummy’s Hand (1940)',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
# Youtube embedded video
'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
- 'md5': 'df4cf8a1dcedaec79a73d96d83b99023',
+ 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',
'info_dict': {
'id': 'OEVzPCY2T-g',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
'upload_date': '20061207',
'uploader': 'Cinemassacre',
@@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor):
{
# Youtube embedded video
'url': 'http://cinemassacre.com/2006/09/01/mckids/',
- 'md5': '6eb30961fa795fedc750eac4881ad2e1',
+ 'md5': '7393c4e0f54602ad110c793eb7a6513a',
'info_dict': {
'id': 'FnxsNhuikpo',
- 'ext': 'mp4',
+ 'ext': 'webm',
'upload_date': '20060901',
- 'uploader': 'Cinemassacre Extras',
+ 'uploader': 'Cinemassacre Extra',
'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
'uploader_id': 'Cinemassacre',
'title': 'AVGN: McKids',
@@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor):
'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
'upload_date': '20150525',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
]
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 5c3908f72..3cf0bf95b 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -51,9 +51,7 @@ class CNETIE(ThePlatformIE):
uploader = None
uploader_id = None
- mpx_account = data['config']['uvpConfig']['default']['mpx_account']
-
- metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id)
+ metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id)
description = vdata.get('description') or metadata.get('description')
duration = int_or_none(vdata.get('duration')) or metadata.get('duration')
@@ -62,7 +60,7 @@ class CNETIE(ThePlatformIE):
for (fkey, vid) in vdata['files'].items():
if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
continue
- release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid)
+ release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid
if fkey == 'hds':
release_url += '&manifest=f4m'
tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index f411ea763..ecd7da767 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -15,13 +15,14 @@ import math
from ..compat import (
compat_cookiejar,
compat_cookies,
+ compat_etree_fromstring,
compat_getpass,
compat_http_client,
+ compat_os_name,
+ compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urlparse,
- compat_str,
- compat_etree_fromstring,
)
from ..utils import (
NO_DEFAULT,
@@ -46,6 +47,8 @@ from ..utils import (
xpath_with_ns,
determine_protocol,
parse_duration,
+ mimetype2ext,
+ update_url_query,
)
@@ -103,7 +106,7 @@ class InfoExtractor(object):
* protocol The protocol that will be used for the actual
download, lower-case.
"http", "https", "rtsp", "rtmp", "rtmpe",
- "m3u8", or "m3u8_native".
+ "m3u8", "m3u8_native" or "http_dash_segments".
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -156,12 +159,14 @@ class InfoExtractor(object):
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
+ license: License name the video is licensed under.
creator: The main artist who created the video.
release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
+ uploader_url: Full URL to a personal webpage of the video uploader.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{language: subformats}. "subformats" is a list sorted from
@@ -341,7 +346,7 @@ class InfoExtractor(object):
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
""" Returns the response handle """
if note is None:
self.report_download_webpage(video_id)
@@ -350,6 +355,12 @@ class InfoExtractor(object):
self.to_screen('%s' % (note,))
else:
self.to_screen('%s: %s' % (video_id, note))
+ # data, headers and query params will be ignored for `Request` objects
+ if isinstance(url_or_request, compat_str):
+ if query:
+ url_or_request = update_url_query(url_or_request, query)
+ if data or headers:
+ url_or_request = sanitized_Request(url_or_request, data, headers or {})
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -365,13 +376,13 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
if urlh is False:
assert not fatal
return False
@@ -424,7 +435,7 @@ class InfoExtractor(object):
self.to_screen('Saving request to ' + filename)
# Working around MAX_PATH limitation on Windows (see
# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
- if os.name == 'nt':
+ if compat_os_name == 'nt':
absfilepath = os.path.abspath(filename)
if len(absfilepath) > 259:
filename = '\\\\?\\' + absfilepath
@@ -458,13 +469,13 @@ class InfoExtractor(object):
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -479,10 +490,10 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None):
+ transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
if transform_source:
@@ -493,10 +504,10 @@ class InfoExtractor(object):
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True, encoding=None):
+ fatal=True, encoding=None, data=None, headers=None, query=None):
json_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding)
+ encoding=encoding, data=data, headers=headers, query=query)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -593,7 +604,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
+ if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -899,6 +910,16 @@ class InfoExtractor(object):
item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
formats)
+ @staticmethod
+ def _remove_duplicate_formats(formats):
+ format_urls = set()
+ unique_formats = []
+ for f in formats:
+ if f['url'] not in format_urls:
+ format_urls.add(f['url'])
+ unique_formats.append(f)
+ formats[:] = unique_formats
+
def _is_valid_url(self, url, video_id, item='video'):
url = self._proto_relative_url(url, scheme='http:')
# For now assume non HTTP(S) URLs always valid
@@ -952,6 +973,13 @@ class InfoExtractor(object):
if manifest is False:
return []
+ return self._parse_f4m_formats(
+ manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal)
+
+ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
formats = []
manifest_version = '1.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -977,7 +1005,8 @@ class InfoExtractor(object):
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
formats.extend(self._extract_f4m_formats(
- manifest_url, video_id, preference, f4m_id, fatal=fatal))
+ manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal))
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -1022,11 +1051,21 @@ class InfoExtractor(object):
return []
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
- # A Media Playlist Tag MUST NOT appear in a Master Playlist
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
- if '#EXT-X-TARGETDURATION' in m3u8_doc:
+
+ # We should try extracting formats only from master playlists [1], i.e.
+ # playlists that describe available qualities. On the other hand, media
+ # playlists [2] should be returned as is since they contain just the media
+ # without quality renditions.
+ # Fortunately, a master playlist can be easily distinguished from a media
+ # playlist based on the presence of particular tags. As per [1, 2], master
+ # playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As per [3], the #EXT-X-TARGETDURATION tag is REQUIRED for every media
+ # playlist and MUST NOT appear in a master playlist, thus we can clearly
+ # detect a media playlist with this criterion.
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
+ # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
+ # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
'format_id': m3u8_id,
@@ -1073,19 +1112,29 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}
- codecs = last_info.get('CODECS')
- if codecs:
- # TODO: looks like video codec is not always necessarily goes first
- va_codecs = codecs.split(',')
- if va_codecs[0]:
- f['vcodec'] = va_codecs[0]
- if len(va_codecs) > 1 and va_codecs[1]:
- f['acodec'] = va_codecs[1]
resolution = last_info.get('RESOLUTION')
if resolution:
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ codecs = last_info.get('CODECS')
+ if codecs:
+ vcodec, acodec = [None] * 2
+ va_codecs = codecs.split(',')
+ if len(va_codecs) == 1:
+ # Audio-only entries usually come with a single codec and
+ # no resolution. For more robustness we also check that it
+ # is mp4 audio.
+ if not resolution and va_codecs[0].startswith('mp4a'):
+ vcodec, acodec = 'none', va_codecs[0]
+ else:
+ vcodec = va_codecs[0]
+ else:
+ vcodec, acodec = va_codecs[:2]
+ f.update({
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
if last_media is not None:
f['m3u8_media'] = last_media
last_media = None
@@ -1106,8 +1155,8 @@ class InfoExtractor(object):
out.append('{%s}%s' % (namespace, c))
return '/'.join(out)
- def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
- smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if smil is False:
assert not fatal
@@ -1124,10 +1173,10 @@ class InfoExtractor(object):
return {}
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
- def _download_smil(self, smil_url, video_id, fatal=True):
+ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
return self._download_xml(
smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
@@ -1277,16 +1326,7 @@ class InfoExtractor(object):
if not src or src in urls:
continue
urls.append(src)
- ext = textstream.get('ext') or determine_ext(src)
- if not ext:
- type_ = textstream.get('type')
- SUBTITLES_TYPES = {
- 'text/vtt': 'vtt',
- 'text/srt': 'srt',
- 'application/smptett+xml': 'tt',
- }
- if type_ in SUBTITLES_TYPES:
- ext = SUBTITLES_TYPES[type_]
+ ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
subtitles.setdefault(lang, []).append({
'url': src,
@@ -1422,8 +1462,9 @@ class InfoExtractor(object):
continue
representation_attrib = adaptation_set.attrib.copy()
representation_attrib.update(representation.attrib)
- mime_type = representation_attrib.get('mimeType')
- content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+ # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
+ mime_type = representation_attrib['mimeType']
+ content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
@@ -1446,6 +1487,7 @@ class InfoExtractor(object):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'url': base_url,
+ 'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
@@ -1598,6 +1640,15 @@ class InfoExtractor(object):
def _get_automatic_captions(self, *args, **kwargs):
raise NotImplementedError('This method must be implemented by subclasses')
+ def mark_watched(self, *args, **kwargs):
+ if (self._downloader.params.get('mark_watched', False) and
+ (self._get_login_info()[0] is not None or
+ self._downloader.params.get('cookiefile') is not None)):
+ self._mark_watched(*args, **kwargs)
+
+ def _mark_watched(self, *args, **kwargs):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 373b3b4b4..bdc768c78 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):
'display_id': 'iseven',
'ext': 'flv',
'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+ 'description': 'md5:f34981259a03e980a3c6404190a3ed61',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': '7师傅',
'uploader_id': '431925',
@@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
}, {
'url': 'http://www.douyutv.com/85982',
'info_dict': {
@@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Romm not found',
+ }, {
+ 'url': 'http://www.douyutv.com/17732',
+ 'info_dict': {
+ 'id': '17732',
+ 'display_id': '17732',
+ 'ext': 'flv',
+ 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:f34981259a03e980a3c6404190a3ed61',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': '7师傅',
+ 'uploader_id': '431925',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 6cda56a7f..a638c827c 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -1,6 +1,8 @@
-# encoding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
import time
from .common import InfoExtractor
@@ -8,44 +10,125 @@ from ..utils import int_or_none
class DPlayIE(InfoExtractor):
- _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'http://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
- _TEST = {
+ _TESTS = [{
+ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
+ 'info_dict': {
+ 'id': '1255600',
+ 'display_id': 'stagione-1-episodio-25',
+ 'ext': 'mp4',
+ 'title': 'Episodio 25',
+ 'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
+ 'duration': 2761,
+ 'timestamp': 1454701800,
+ 'upload_date': '20160205',
+ 'creator': 'RTIT',
+ 'series': 'Take me out',
+ 'season_number': 1,
+ 'episode_number': 25,
+ 'age_limit': 0,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
+ }, {
'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
'info_dict': {
'id': '3172',
- 'ext': 'mp4',
'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+ 'ext': 'flv',
'title': 'Svensken lär sig njuta av livet',
+ 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
'duration': 2650,
+ 'timestamp': 1365454320,
+ 'upload_date': '20130408',
+ 'creator': 'Kanal 5 (Home)',
+ 'series': 'Nugammalt - 77 händelser som format Sverige',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'age_limit': 0,
},
- }
+ }, {
+ 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/',
+ 'info_dict': {
+ 'id': '70816',
+ 'display_id': 'season-6-episode-12',
+ 'ext': 'flv',
+ 'title': 'Episode 12',
+ 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90',
+ 'duration': 2563,
+ 'timestamp': 1429696800,
+ 'upload_date': '20150422',
+ 'creator': 'Kanal 4',
+ 'series': 'Mig og min mor',
+ 'season_number': 6,
+ 'episode_number': 12,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ domain = mobj.group('domain')
+
webpage = self._download_webpage(url, display_id)
+
video_id = self._search_regex(
- r'data-video-id="(\d+)"', webpage, 'video id')
+ r'data-video-id=["\'](\d+)', webpage, 'video id')
info = self._download_json(
- 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+ 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),
video_id)['data'][0]
- self._set_cookie(
- 'secure.dplay.se', 'dsc-geo',
- '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
- # TODO: consider adding support for 'stream_type=hds', it seems to
- # require setting some cookies
- manifest_url = self._download_json(
- 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
- video_id, 'Getting manifest url for hls stream')['hls']
- formats = self._extract_m3u8_formats(
- manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+ title = info['title']
+
+ PROTOCOLS = ('hls', 'hds')
+ formats = []
+
+ def extract_formats(protocol, manifest_url):
+ if protocol == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False))
+ elif protocol == 'hds':
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0',
+ video_id, f4m_id=protocol, fatal=False))
+
+ domain_tld = domain.split('.')[-1]
+ if domain_tld in ('se', 'dk'):
+ for protocol in PROTOCOLS:
+ self._set_cookie(
+ 'secure.dplay.%s' % domain_tld, 'dsc-geo',
+ json.dumps({
+ 'countryCode': domain_tld.upper(),
+ 'expiry': (time.time() + 20 * 60) * 1000,
+ }))
+ stream = self._download_json(
+ 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s'
+ % (domain_tld, video_id, protocol), video_id,
+ 'Downloading %s stream JSON' % protocol, fatal=False)
+ if stream and stream.get(protocol):
+ extract_formats(protocol, stream[protocol])
+ else:
+ for protocol in PROTOCOLS:
+ if info.get(protocol):
+ extract_formats(protocol, info[protocol])
return {
'id': video_id,
'display_id': display_id,
- 'title': info['title'],
- 'formats': formats,
+ 'title': title,
+ 'description': info.get('video_metadata_longDescription'),
'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+ 'timestamp': int_or_none(info.get('video_publish_date')),
+ 'creator': info.get('video_metadata_homeChannel'),
+ 'series': info.get('video_metadata_show'),
+ 'season_number': int_or_none(info.get('season')),
+ 'episode_number': int_or_none(info.get('episode')),
+ 'age_limit': int_or_none(info.get('minimum_age')),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644
index 000000000..b6c985547
--- /dev/null
+++ b/youtube_dl/extractor/dw.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+ IE_NAME = 'dw'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+ _TESTS = [{
+ # video
+ 'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+ 'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+ 'info_dict': {
+ 'id': '19112290',
+ 'ext': 'mp4',
+ 'title': 'Intelligent light',
+ 'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+ 'upload_date': '20160311',
+ }
+ }, {
+ # audio
+ 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+ 'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+ 'info_dict': {
+ 'id': '19111941',
+ 'ext': 'mp3',
+ 'title': 'WorldLink: My business',
+ 'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+ 'upload_date': '20160311',
+ }
+ }]
+
+ def _real_extract(self, url):
+ media_id = self._match_id(url)
+ webpage = self._download_webpage(url, media_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ title = hidden_inputs['media_title']
+
+ formats = []
+ if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+ formats = self._extract_smil_formats(
+ 'http://www.dw.com/smil/v-%s' % media_id, media_id,
+ transform_source=lambda s: s.replace(
+ 'rtmp://tv-od.dw.de/flash/',
+ 'http://tv-download.dw.de/dwtv_video/flv/'))
+ else:
+ formats = [{'url': hidden_inputs['file_name']}]
+
+ return {
+ 'id': media_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': hidden_inputs.get('preview_image'),
+ 'duration': int_or_none(hidden_inputs.get('file_duration')),
+ 'upload_date': hidden_inputs.get('display_date'),
+ 'formats': formats,
+ }
+
+
+class DWArticleIE(InfoExtractor):
+ IE_NAME = 'dw:article'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+ 'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+ 'info_dict': {
+ 'id': '19105868',
+ 'ext': 'mp4',
+ 'title': 'The harsh life of refugees in Idomeni',
+ 'description': 'md5:196015cc7e48ebf474db9399420043c7',
+ 'upload_date': '20160310',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ media_id = hidden_inputs['media_id']
+ media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+ media_url = compat_urlparse.urljoin(url, media_path)
+ return self.url_result(media_url, 'DW', media_id)
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
index 00a69e631..8c725a4e6 100644
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESC = 'El País'
- _TEST = {
+ _TESTS = [{
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
'md5': '98406f301f19562170ec071b83433d55',
'info_dict': {
@@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):
'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
'upload_date': '20140206',
}
- }
+ }, {
+ 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t',
+ 'md5': '3bd5b09509f3519d7d9e763179b013de',
+ 'info_dict': {
+ 'id': '1456340311_668921',
+ 'ext': 'mp4',
+ 'title': 'Cómo hacer el mejor café con cafetera italiana',
+ 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.',
+ 'upload_date': '20160303',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
- r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
+ r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')
video_suffix = self._search_regex(
- r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
+ r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')
video_url = prefix + video_suffix
thumbnail_suffix = self._search_regex(
- r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
- fatal=False)
+ r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'",
+ webpage, 'thumbnail URL', fatal=False)
thumbnail = (
None if thumbnail_suffix is None
else prefix + thumbnail_suffix)
title = self._html_search_regex(
- '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+ # The tuple holds candidate regexes only; the page text and the field
+ # name are the call's own arguments on the next line
+ (r"tituloVideo\s*=\s*'([^']+)'",
+ r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),
webpage, 'title')
- date_str = self._search_regex(
+ upload_date = unified_strdate(self._search_regex(
r'<p class="date-header date-int updated"\s+title="([^"]+)">',
- webpage, 'upload date', fatal=False)
- upload_date = (None if date_str is None else unified_strdate(date_str))
+ webpage, 'upload date', default=None) or self._html_search_meta(
+ 'datePublished', webpage, 'timestamp'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index e4180701d..e5e57d485 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -1,21 +1,13 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- url_basename,
-)
class EngadgetIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video(?:/5min)?/(?P<id>\d+)|
- [\d/]+/.*?)
- '''
+ _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.engadget.com/video/5min/518153925/',
+ 'url': 'http://www.engadget.com/video/518153925/',
'md5': 'c6820d4828a5064447a4d9fc73f312c9',
'info_dict': {
'id': '518153925',
@@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- if video_id is not None:
- return self.url_result('5min:%s' % video_id)
- else:
- title = url_basename(url)
- webpage = self._download_webpage(url, title)
- ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': [self.url_result('5min:%s' % vid) for vid in ids]
- }
+ return self.url_result('5min:%s' % video_id)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 0a9a5ca71..f5bbd39d2 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -34,9 +34,12 @@ class FacebookIE(InfoExtractor):
video/video\.php|
photo\.php|
video\.php|
- video/embed
- )\?(?:.*?)(?:v|video_id)=|
- [^/]+/videos/(?:[^/]+/)?
+ video/embed|
+ story\.php
+ )\?(?:.*?)(?:v|video_id|story_fbid)=|
+ [^/]+/videos/(?:[^/]+/)?|
+ [^/]+/posts/|
+ groups/[^/]+/permalink/
)|
facebook:
)
@@ -49,6 +52,8 @@ class FacebookIE(InfoExtractor):
_CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
+ _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
+
_TESTS = [{
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659',
@@ -81,6 +86,33 @@ class FacebookIE(InfoExtractor):
'uploader': 'Demy de Zeeuw',
},
}, {
+ 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
+ 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
+ 'info_dict': {
+ 'id': '544765982287235',
+ 'ext': 'mp4',
+ 'title': '"What are you doing running in the snow?"',
+ 'uploader': 'FailArmy',
+ }
+ }, {
+ 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
+ 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
+ 'info_dict': {
+ 'id': '1035862816472149',
+ 'ext': 'mp4',
+ 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog',
+ 'uploader': 'S. Saint',
+ },
+ }, {
+ 'note': 'swf params escaped',
+ 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
+ 'md5': '97ba073838964d12c70566e0085c2b91',
+ 'info_dict': {
+ 'id': '10153664894881749',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #10153664894881749',
+ },
+ }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
}, {
@@ -92,6 +124,9 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'facebook:544765982287235',
'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+ 'only_matching': True,
}]
def _login(self):
@@ -160,19 +195,19 @@ class FacebookIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url):
- video_id = self._match_id(url)
- req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
+ def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+ req = sanitized_Request(url)
req.add_header('User-Agent', self._CHROME_USER_AGENT)
webpage = self._download_webpage(req, video_id)
video_data = None
- BEFORE = '{swf.addParam(param[0], param[1]);});\n'
+ BEFORE = '{swf.addParam(param[0], param[1]);});'
AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
- m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
+ m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
if m:
- data = dict(json.loads(m.group(1)))
+ swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+ data = dict(json.loads(swf_params))
params_raw = compat_urllib_parse_unquote(data['params'])
video_data = json.loads(params_raw)['video_data']
@@ -185,13 +220,15 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(self._search_regex(
- r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id)
+ r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id)
for item in server_js_data.get('instances', []):
if item[1][0] == 'VideoConfig':
video_data = video_data_list2dict(item[2][0]['videoData'])
break
if not video_data:
+ if not fatal_if_no_video:
+ return webpage, False
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
@@ -208,10 +245,13 @@ class FacebookIE(InfoExtractor):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
+ preference = -10 if format_id == 'progressive' else 0
+ if quality == 'hd':
+ preference += 5
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,
- 'preference': -10 if format_id == 'progressive' else 0,
+ 'preference': preference,
})
dash_manifest = f[0].get('dash_manifest')
if dash_manifest:
@@ -234,39 +274,36 @@ class FacebookIE(InfoExtractor):
video_title = 'Facebook video #%s' % video_id
uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
- return {
+ info_dict = {
'id': video_id,
'title': video_title,
'formats': formats,
'uploader': uploader,
}
-
-class FacebookPostIE(InfoExtractor):
- IE_NAME = 'facebook:post'
- _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
- 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
- 'info_dict': {
- 'id': '544765982287235',
- 'ext': 'mp4',
- 'title': '"What are you doing running in the snow?"',
- 'uploader': 'FailArmy',
- }
- }
+ return webpage, info_dict
def _real_extract(self, url):
- post_id = self._match_id(url)
+ video_id = self._match_id(url)
+
+ real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
+ webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
- webpage = self._download_webpage(url, post_id)
+ if info_dict:
+ return info_dict
- entries = [
- self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
- for video_id in self._parse_json(
- self._search_regex(
- r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
- webpage, 'video ids', group='ids'),
- post_id)]
+ if '/posts/' in url:
+ entries = [
+ self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+ for vid in self._parse_json(
+ self._search_regex(
+ r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
+ webpage, 'video ids', group='ids'),
+ video_id)]
- return self.playlist_result(entries, post_id)
+ return self.playlist_result(entries, video_id)
+ else:
+ _, info_dict = self._extract_from_url(
+ self._VIDEO_PAGE_TEMPLATE % video_id,
+ video_id, fatal_if_no_video=True)
+ return info_dict
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 6f9b003c2..fd535457d 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -52,7 +52,7 @@ class FazIE(InfoExtractor):
formats = []
for pref, code in enumerate(['LOW', 'HIGH', 'HQ']):
encoding = xpath_element(encodings, code)
- if encoding:
+ if encoding is not None:
encoding_url = xpath_text(encoding, 'FILENAME')
if encoding_url:
formats.append({
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 2955965d9..67d50a386 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -1,5 +1,7 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
@@ -16,12 +18,7 @@ from ..utils import (
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
- _VALID_URL = r'''(?x)
- (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
- https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
- 5min:)
- (?P<id>\d+)
- '''
+ _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'
_TESTS = [
{
@@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):
'title': 'How to Make a Next-Level Fruit Salad',
'duration': 184,
},
+ 'skip': 'no longer available',
},
]
_ERRORS = {
@@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ sid = mobj.group('sid')
+
+ if mobj.group('query'):
+ qs = compat_parse_qs(mobj.group('query'))
+ if not qs.get('playList'):
+ raise ExtractorError('Invalid URL', expected=True)
+ video_id = qs['playList'][0]
+ if qs.get('sid'):
+ sid = qs['sid'][0]
+
embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
- embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
- sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
- query = compat_urllib_parse.urlencode({
- 'func': 'GetResults',
- 'playlist': video_id,
- 'sid': sid,
- 'isPlayerSeed': 'true',
- 'url': embed_url,
- })
+ if not sid:
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed page')
+ sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+
response = self._download_json(
- 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+ 'https://syn.5min.com/handlers/SenseHandler.ashx?' +
+ compat_urllib_parse.urlencode({
+ 'func': 'GetResults',
+ 'playlist': video_id,
+ 'sid': sid,
+ 'isPlayerSeed': 'true',
+ 'url': embed_url,
+ }),
video_id)
if not response['success']:
raise ExtractorError(
@@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):
parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
for rendition in info['Renditions']:
- if rendition['RenditionType'] == 'm3u8':
- formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
- elif rendition['RenditionType'] == 'aac':
+ if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':
continue
else:
rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 318ac013d..1dc50318c 100644
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE):
# 'upload_date': '20141204',
'thumbnail': 're:^https?://.*\.jpg$',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py
index c210177f7..1477708bb 100644
--- a/youtube_dl/extractor/freespeech.py
+++ b/youtube_dl/extractor/freespeech.py
@@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor):
'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0',
'info_dict': {
'id': 'poKsVCZ64uU',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'description': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'uploader': 'freespeechtv',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 45adbb7a3..8121f04a5 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE
from .svt import SVTIE
from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
@@ -1241,28 +1242,34 @@ class GenericIE(InfoExtractor):
full_response = self._request_webpage(request, video_id)
head_response = full_response
+ info_dict = {
+ 'id': video_id,
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ }
+
# Check for direct link to a video
content_type = head_response.headers.get('Content-Type', '')
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
if m:
upload_date = unified_strdate(
head_response.headers.get('Last-Modified'))
- formats = []
- if m.group('format_id').endswith('mpegurl'):
+ format_id = m.group('format_id')
+ if format_id.endswith('mpegurl'):
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+ elif format_id == 'f4m':
+ formats = self._extract_f4m_formats(url, video_id)
else:
formats = [{
'format_id': m.group('format_id'),
'url': url,
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ info_dict.update({
'direct': True,
'formats': formats,
'upload_date': upload_date,
- }
+ })
+ return info_dict
if not self._downloader.params.get('test', False) and not is_intentional:
force = self._downloader.params.get('force_generic_extractor', False)
@@ -1290,13 +1297,12 @@ class GenericIE(InfoExtractor):
'URL could be a direct video link, returning it as such.')
upload_date = unified_strdate(
head_response.headers.get('Last-Modified'))
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ info_dict.update({
'direct': True,
'url': url,
'upload_date': upload_date,
- }
+ })
+ return info_dict
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
@@ -1313,12 +1319,12 @@ class GenericIE(InfoExtractor):
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
- 'formats': self._parse_mpd_formats(
- doc, video_id, mpd_base_url=url.rpartition('/')[0]),
- }
+ info_dict['formats'] = self._parse_mpd_formats(
+ doc, video_id, mpd_base_url=url.rpartition('/')[0])
+ return info_dict
+ elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+ info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+ return info_dict
except compat_xml_parse_error:
pass
@@ -1573,6 +1579,11 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'VK')
+ # Look for embedded Odnoklassniki player
+ mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Odnoklassniki')
+
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
if mobj is not None:
@@ -1628,6 +1639,11 @@ class GenericIE(InfoExtractor):
if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+ # Look for embedded TNAFlixNetwork player
+ tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+ if tnaflix_urls:
+ return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+
# Look for embedded Tvigle player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1974,6 +1990,8 @@ class GenericIE(InfoExtractor):
entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
elif ext == 'mpd':
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+ elif ext == 'f4m':
+ entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
else:
entry_info_dict['url'] = video_url
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index f354c9c7a..766fc26d0 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -10,8 +10,8 @@ from ..utils import (
class GoogleDriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
- _TEST = {
+ _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+ _TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': '881f7700aec4f538571fa1e0eed4a7b6',
'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
'title': 'Big Buck Bunny.mp4',
'duration': 46,
}
- }
+ }, {
+ # video id is longer than 28 characters
+ 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+ 'only_matching': True,
+ }]
_FORMATS_EXT = {
'5': 'flv',
'6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+ r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
webpage)
if mobj:
return 'https://drive.google.com/file/d/%s' % mobj.group('id')
@@ -82,7 +86,7 @@ class GoogleDriveIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 02e1e428e..b61b2dc4e 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor):
for f_url, f_name in extra_formats]
format_pages.append(player_page)
- quality = qualities(['SD', '480p', '720p'])
+ quality = qualities(('SD', '480p', '720p', '1080p'))
formats = []
for format_page in format_pages:
json_data = self._search_regex(
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 12fb5e8e1..9622f198a 100644
--- a/youtube_dl/extractor/indavideo.py
+++ b/youtube_dl/extractor/indavideo.py
@@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor):
'url': self._proto_relative_url(thumbnail)
} for thumbnail in video.get('thumbnails', [])]
- tags = [tag['title'] for tag in video.get('tags', [])]
+ tags = [tag['title'] for tag in video.get('tags') or []]
return {
'id': video.get('id') or video_id,
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 016af2084..cca0b8a93 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -4,15 +4,12 @@ from __future__ import unicode_literals
import base64
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_parse_qs,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import determine_ext
+from .bokecc import BokeCCBaseIE
-class InfoQIE(InfoExtractor):
+class InfoQIE(BokeCCBaseIE):
_VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
@@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor):
},
}]
- def _extract_bokecc_videos(self, webpage, video_id):
- # TODO: bokecc.com is a Chinese video cloud platform
- # It should have an independent extractor but I don't have other
- # examples using bokecc
- player_params_str = self._html_search_regex(
- r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
- webpage, 'player params', default=None)
-
- player_params = compat_parse_qs(player_params_str)
-
- info_xml = self._download_xml(
- 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
- player_params['siteid'][0], player_params['vid'][0]), video_id)
-
- return [{
- 'format_id': 'bokecc',
- 'url': quality.find('./copy').attrib['playurl'],
- 'preference': int(quality.attrib['value']),
- } for quality in info_xml.findall('./video/quality')]
-
def _extract_rtmp_videos(self, webpage):
# The server URL is hardcoded
video_url = 'rtmpe://video.infoq.com/cfx/st/'
@@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor):
if '/cn/' in url:
# for China videos, HTTP video URL exists but always fails with 403
- formats = self._extract_bokecc_videos(webpage, video_id)
+ formats = self._extract_bokecc_formats(webpage, video_id)
else:
formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 9046705a5..e7c0cb3f6 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -2,32 +2,173 @@
from __future__ import unicode_literals
import hashlib
+import itertools
import math
import os
import random
+import re
import time
import uuid
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
+ compat_str,
compat_urllib_parse,
compat_urllib_parse_urlparse,
)
from ..utils import (
+ decode_packed_codes,
ExtractorError,
+ ohdave_rsa_encrypt,
+ remove_start,
sanitized_Request,
urlencode_postdata,
url_basename,
)
+ def md5_text(text):
+     """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
+     digest = hashlib.md5()
+     digest.update(text.encode('utf-8'))
+     return digest.hexdigest()
+
+
+ class IqiyiSDK(object):
+     """String transforms that IqiyiSDKInterpreter applies to a target.
+
+     Every transform rewrites ``self.target`` in place, usually starting from
+     its MD5 hex digest. ``ip`` is a dotted string of integers. ``timestamp``
+     is expected to be an integer (it is passed to time.localtime and its
+     decimal digits are summed).
+     """
+
+     def __init__(self, target, ip, timestamp):
+         self.timestamp = timestamp
+         self.ip = ip
+         self.target = target
+
+     @staticmethod
+     def split_sum(data):
+         """Return, as text, the sum of the characters of *data* read as hex digits."""
+         total = sum(int(char, 16) for char in data)
+         return compat_str(total)
+
+     @staticmethod
+     def digit_sum(num):
+         """Return, as text, the sum of the decimal digits of *num* (int or digit string)."""
+         digits = compat_str(num) if isinstance(num, int) else num
+         return compat_str(sum(int(digit) for digit in digits))
+
+     def even_odd(self):
+         """Return digit sums of the even-indexed and odd-indexed timestamp digits."""
+         stamp = compat_str(self.timestamp)
+         return self.digit_sum(stamp[::2]), self.digit_sum(stamp[1::2])
+
+     def preprocess(self, chunksize):
+         """Hash the target and return (digest chunks, IP parts as integers).
+
+         The 32-character digest is cut into pieces of *chunksize* characters;
+         a shorter final piece is kept when 32 is not a multiple of *chunksize*.
+         """
+         self.target = md5_text(self.target)
+         whole, leftover = divmod(32, chunksize)
+         chunks = [
+             self.target[chunksize * index:chunksize * (index + 1)]
+             for index in range(whole)]
+         if leftover:
+             chunks.append(self.target[32 - leftover:])
+         return chunks, [int(part) for part in self.ip.split('.')]
+
+     def mod(self, modulus):
+         """Set the target to its digest followed by each IP part modulo *modulus*."""
+         chunks, ip = self.preprocess(32)
+         remainders = [compat_str(part % modulus) for part in ip]
+         self.target = chunks[0] + ''.join(remainders)
+
+     def split(self, chunksize):
+         """Interleave digest chunks with reduced IP parts.
+
+         The first four chunks each get one IP part (reduced by a modulus that
+         depends on *chunksize*); for a chunk size of 8 the IP part goes before
+         the chunk, otherwise after it.
+         """
+         modulus_map = {
+             4: 256,
+             5: 10,
+             8: 100,
+         }
+
+         chunks, ip = self.preprocess(chunksize)
+         ip_goes_first = chunksize == 8
+         pieces = []
+         for index, chunk in enumerate(chunks):
+             if index < 4:
+                 ip_part = compat_str(ip[index] % modulus_map[chunksize])
+             else:
+                 ip_part = ''
+             pieces.append(ip_part + chunk if ip_goes_first else chunk + ip_part)
+         self.target = ''.join(pieces)
+
+     def handle_input16(self):
+         """Wrap the digest between the hex-digit sums of its two halves."""
+         self.target = md5_text(self.target)
+         digest = self.target
+         self.target = self.split_sum(digest[:16]) + digest + self.split_sum(digest[16:])
+
+     def handle_input8(self):
+         """Prefix each 8-character quarter of the digest with its hex-digit sum."""
+         self.target = md5_text(self.target)
+         quarters = [self.target[start:start + 8] for start in range(0, 32, 8)]
+         self.target = ''.join(
+             self.split_sum(quarter) + quarter for quarter in quarters)
+
+     def handleSum(self):
+         """Prefix the digest with the sum of all its hex digits."""
+         self.target = md5_text(self.target)
+         self.target = self.split_sum(self.target) + self.target
+
+     def date(self, scheme):
+         """Append local-time date fields to the digest.
+
+         *scheme* is a sequence of the letters 'y', 'm' and 'd' selecting the
+         year, zero-padded month and zero-padded day of ``self.timestamp``.
+         """
+         self.target = md5_text(self.target)
+         moment = time.localtime(self.timestamp)
+         fields = {
+             'y': compat_str(moment.tm_year),
+             'm': '%02d' % moment.tm_mon,
+             'd': '%02d' % moment.tm_mday,
+         }
+         self.target += ''.join(fields[letter] for letter in scheme)
+
+     def split_time_even_odd(self):
+         """Surround the digest with the odd (front) and even (back) digit sums."""
+         even, odd = self.even_odd()
+         digest = md5_text(self.target)
+         self.target = odd + digest + even
+
+     def split_time_odd_even(self):
+         """Surround the digest with the even (front) and odd (back) digit sums."""
+         even, odd = self.even_odd()
+         digest = md5_text(self.target)
+         self.target = even + digest + odd
+
+     def split_ip_time_sum(self):
+         """Put the sum of the IP parts before the digest and the timestamp digit sum after."""
+         chunks, ip = self.preprocess(32)
+         ip_total = compat_str(sum(ip))
+         self.target = ip_total + chunks[0] + self.digit_sum(self.timestamp)
+
+     def split_time_ip_sum(self):
+         """Put the timestamp digit sum before the digest and the sum of the IP parts after."""
+         chunks, ip = self.preprocess(32)
+         time_total = self.digit_sum(self.timestamp)
+         self.target = time_total + chunks[0] + compat_str(sum(ip))
+
+
+ class IqiyiSDKInterpreter(object):
+     """Apply the transforms named in iQiyi SDK code to a target string.
+
+     The SDK code is decoded with decode_packed_codes, the names of the
+     functions applied to ``input`` are collected with a regex, and the
+     matching IqiyiSDK methods are called in the order they appear.
+     """
+
+     def __init__(self, sdk_code):
+         self.sdk_code = sdk_code
+
+     def run(self, target, ip, timestamp):
+         """Return *target* after every transform named in the SDK code.
+
+         Raises ExtractorError when the code names a function that has no
+         counterpart in IqiyiSDK.
+         """
+         # Keep the decoded text local: self.sdk_code keeps the code as given,
+         # so a repeated run() does not feed already-decoded text back into
+         # decode_packed_codes
+         sdk_code = decode_packed_codes(self.sdk_code)
+
+         functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', sdk_code)
+
+         sdk = IqiyiSDK(target, ip, timestamp)
+
+         other_functions = {
+             'handleSum': sdk.handleSum,
+             'handleInput8': sdk.handle_input8,
+             'handleInput16': sdk.handle_input16,
+             'splitTimeEvenOdd': sdk.split_time_even_odd,
+             'splitTimeOddEven': sdk.split_time_odd_even,
+             'splitIpTimeSum': sdk.split_ip_time_sum,
+             'splitTimeIpSum': sdk.split_time_ip_sum,
+         }
+         for function in functions:
+             # The patterns end in $ so that only exact names (mod7, dateymd,
+             # split4) reach int() or IqiyiSDK.date(); anything else falls
+             # through to the ExtractorError below
+             if re.match(r'mod\d+$', function):
+                 sdk.mod(int(function[3:]))
+             elif re.match(r'date[ymd]{3}$', function):
+                 sdk.date(function[4:])
+             elif re.match(r'split\d+$', function):
+                 sdk.split(int(function[5:]))
+             elif function in other_functions:
+                 other_functions[function]()
+             else:
+                 raise ExtractorError('Unknown function %s' % function)
+
+         return sdk.target
+
+
class IqiyiIE(InfoExtractor):
IE_NAME = 'iqiyi'
IE_DESC = '爱奇艺'
_VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+ _NETRC_MACHINE = 'iqiyi'
+
_TESTS = [{
'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
'md5': '2cb594dc2781e6c941a110d8f358118b',
@@ -125,6 +266,13 @@ class IqiyiIE(InfoExtractor):
},
}],
'expected_warnings': ['Needs a VIP account for full video'],
+ }, {
+ 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+ 'info_dict': {
+ 'id': '202918101',
+ 'title': '灌篮高手 国语版',
+ },
+ 'playlist_count': 101,
}]
_FORMATS_MAP = [
@@ -136,9 +284,63 @@ class IqiyiIE(InfoExtractor):
('10', 'h1'),
]
+ def _real_initialize(self):
+ """Call _login(); its boolean result is not checked here."""
+ self._login()
+
@staticmethod
- def md5_text(text):
- return hashlib.md5(text.encode('utf-8')).hexdigest()
+ def _rsa_fun(data):
+ # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
+ N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
+ e = 65537
+
+ return ohdave_rsa_encrypt(data, e, N)
+
+ def _login(self):
+ """Log in with the configured credentials, if any.
+
+ Returns True when no username is configured or when the validation
+ request answers with code 'A00000'; otherwise reports a warning through
+ the downloader and returns False.
+ """
+ (username, password) = self._get_login_info()
+
+ # No authentication to be performed
+ if not username:
+ return True
+
+ # The token response supplies the SDK code, the IP and the token used below
+ data = self._download_json(
+ 'http://kylin.iqiyi.com/get_token', None,
+ note='Get token for logging', errnote='Unable to get token for logging')
+ sdk = data['sdk']
+ timestamp = int(time.time())
+ # The password is passed through _rsa_fun before it is placed in the target
+ target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
+ username, self._rsa_fun(password.encode('utf-8')))
+
+ # The signature is the target transformed by the steps named in the SDK code
+ interp = IqiyiSDKInterpreter(sdk)
+ sign = interp.run(target, data['ip'], timestamp)
+
+ validation_params = {
+ 'target': target,
+ 'server': 'BEA3AA1908656AABCCFF76582C4C6660',
+ 'token': data['token'],
+ 'bird_src': 'f8d91d57af224da7893dd397d52d811a',
+ 'sign': sign,
+ 'bird_t': timestamp,
+ }
+ validation_result = self._download_json(
+ 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+ note='Validate credentials', errnote='Unable to validate credentials')
+
+ # Friendlier wording for the result codes this extractor knows about
+ MSG_MAP = {
+ 'P00107': 'please login via the web interface and enter the CAPTCHA code',
+ 'P00117': 'bad username or password',
+ }
+
+ code = validation_result['code']
+ # Any code other than 'A00000' is treated as a failed login
+ if code != 'A00000':
+ msg = MSG_MAP.get(code)
+ if not msg:
+ # Unknown code: show it, plus the server's own message when present
+ msg = 'error %s' % code
+ if validation_result.get('msg'):
+ msg += ': ' + validation_result['msg']
+ self._downloader.report_warning('unable to log in: ' + msg)
+ return False
+
+ return True
def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
auth_params = {
@@ -199,7 +401,7 @@ class IqiyiIE(InfoExtractor):
note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
)['t']
t = str(int(math.floor(int(tm) / (600.0))))
- return self.md5_text(t + mg + x)
+ return md5_text(t + mg + x)
video_urls_dict = {}
need_vip_warning_report = True
@@ -278,16 +480,16 @@ class IqiyiIE(InfoExtractor):
tail = tm + tvid
param = {
'key': 'fvip',
- 'src': self.md5_text('youtube-dl'),
+ 'src': md5_text('youtube-dl'),
'tvId': tvid,
'vid': video_id,
'vinfo': 1,
'tm': tm,
- 'enc': self.md5_text(enc_key + tail),
+ 'enc': md5_text(enc_key + tail),
'qyid': _uuid,
'tn': random.random(),
'um': 0,
- 'authkey': self.md5_text(self.md5_text('') + tail),
+ 'authkey': md5_text(md5_text('') + tail),
'k_tag': 1,
}
@@ -296,24 +498,62 @@ class IqiyiIE(InfoExtractor):
raw_data = self._download_json(api_url, video_id)
return raw_data
- def get_enc_key(self, swf_url, video_id):
+ def get_enc_key(self, video_id):
# TODO: automatic key extraction
# last update at 2016-01-22 for Zombie::bite
- enc_key = '6ab6d0280511493ba85594779759d4ed'
+ enc_key = '8ed797d224d043e7ac23d95b70227d32'
return enc_key
+     def _extract_playlist(self, webpage):
+         """Return a playlist result if *webpage* lists album links, else None.
+
+         Links found in *webpage* become the first entries. Further pages are
+         requested from cache.video.qiyi.com, PAGE_SIZE items at a time, until
+         a page holds fewer than PAGE_SIZE items.
+         """
+         PAGE_SIZE = 50
+
+         # [^"]+ keeps each match inside its own href value; a greedy .+ would
+         # run on to the last '.html"' when several links share one line
+         links = re.findall(
+             r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/[^"]+\.html)"',
+             webpage)
+         if not links:
+             return
+
+         album_id = self._search_regex(
+             r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+         album_title = self._search_regex(
+             r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+         entries = list(map(self.url_result, links))
+
+         # Start from 2 because links in the first page are already on webpage
+         for page_num in itertools.count(2):
+             pagelist_page = self._download_webpage(
+                 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                 album_id,
+                 note='Download playlist page %d' % page_num,
+                 errnote='Failed to download playlist page %d' % page_num)
+             # The response is JSON behind a 'var tvInfoJs=' prefix
+             pagelist = self._parse_json(
+                 remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+             vlist = pagelist['data']['vlist']
+             for item in vlist:
+                 entries.append(self.url_result(item['vurl']))
+             # A short page is the last one
+             if len(vlist) < PAGE_SIZE:
+                 break
+
+         return self.playlist_result(entries, album_id, album_title)
+
def _real_extract(self, url):
webpage = self._download_webpage(
url, 'temp_id', note='download video page')
+
+ # There's no simple way to determine whether an URL is a playlist or not
+ # So detect it
+ playlist_result = self._extract_playlist(webpage)
+ if playlist_result:
+ return playlist_result
+
tvid = self._search_regex(
r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
video_id = self._search_regex(
r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
- swf_url = self._search_regex(
- r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
_uuid = uuid.uuid4().hex
- enc_key = self.get_enc_key(swf_url, video_id)
+ enc_key = self.get_enc_key(video_id)
raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index eef7daa29..137db873c 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):
webpage = self._download_webpage(url, title)
title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
config_url = self._html_search_regex(
- r'data-src="(/contenu/medias/video.php.*?)"',
+ r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',
webpage, 'config URL')
config_url = 'http://www.jeuxvideo.com' + config_url
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
index 8e90d5986..6770685d7 100644
--- a/youtube_dl/extractor/jwplatform.py
+++ b/youtube_dl/extractor/jwplatform.py
@@ -7,33 +7,9 @@ from .common import InfoExtractor
from ..utils import int_or_none
-class JWPlatformIE(InfoExtractor):
- _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
- _TEST = {
- 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
- 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
- 'info_dict': {
- 'id': 'nPripu9l',
- 'ext': 'mov',
- 'title': 'Big Buck Bunny Trailer',
- 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
- 'upload_date': '20081127',
- 'timestamp': 1227796140,
- }
- }
-
- @staticmethod
- def _extract_url(webpage):
- mobj = re.search(
- r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
- webpage)
- if mobj:
- return mobj.group('url')
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
- video_data = json_data['playlist'][0]
+class JWPlatformBaseIE(InfoExtractor):
+ def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):
+ video_data = jwplayer_data['playlist'][0]
subtitles = {}
for track in video_data['tracks']:
if track['kind'] == 'captions':
@@ -43,7 +19,7 @@ class JWPlatformIE(InfoExtractor):
for source in video_data['sources']:
source_url = self._proto_relative_url(source['file'])
source_type = source.get('type') or ''
- if source_type == 'application/vnd.apple.mpegurl':
+ if source_type in ('application/vnd.apple.mpegurl', 'hls'):
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
elif source_type.startswith('audio'):
@@ -61,10 +37,39 @@ class JWPlatformIE(InfoExtractor):
return {
'id': video_id,
- 'title': video_data['title'],
+ 'title': video_data['title'] if require_title else video_data.get('title'),
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'timestamp': int_or_none(video_data.get('pubdate')),
'subtitles': subtitles,
'formats': formats,
}
+
+
+class JWPlatformIE(JWPlatformBaseIE):
+ _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+ _TEST = {
+ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
+ 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+ 'info_dict': {
+ 'id': 'nPripu9l',
+ 'ext': 'mov',
+ 'title': 'Big Buck Bunny Trailer',
+ 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
+ 'upload_date': '20081127',
+ 'timestamp': 1227796140,
+ }
+ }
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+ return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index ccbc39c66..44d7c84a1 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urlparse,
+ compat_parse_qs,
)
from ..utils import (
clean_html,
@@ -20,21 +21,17 @@ from ..utils import (
class KalturaIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
- kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+ kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
(?:
(?:
# flash player
- index\.php/kwidget/
- (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
- (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+ index\.php/kwidget|
# html5 player
- html5/html5lib/
- (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
- .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+ html5/html5lib/[^/]+/mwEmbedFrame\.php
)
- )
+ )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
)
'''
_API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
@@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
- partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
- entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
-
- info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ partner_id, entry_id = mobj.group('partner_id', 'id')
+ ks = None
+ if partner_id and entry_id:
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ else:
+ path, query = mobj.group('path', 'query')
+ if not path and not query:
+ raise ExtractorError('Invalid URL', expected=True)
+ params = {}
+ if query:
+ params = compat_parse_qs(query)
+ if path:
+ splitted_path = path.split('/')
+ params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))))
+ if 'wid' in params:
+ partner_id = params['wid'][0][1:]
+ elif 'p' in params:
+ partner_id = params['p'][0]
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ if 'entry_id' in params:
+ entry_id = params['entry_id'][0]
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+ reference_id = params['flashvars[referenceId]'][0]
+ webpage = self._download_webpage(url, reference_id)
+ entry_data = self._parse_json(self._search_regex(
+ r'window\.kalturaIframePackageData\s*=\s*({.*});',
+ webpage, 'kalturaIframePackageData'),
+ reference_id)['entryResult']
+ info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+ entry_id = info['id']
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ ks = params.get('flashvars[ks]', [None])[0]
source_url = smuggled_data.get('source_url')
if source_url:
@@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):
else:
referrer = None
+ def sign_url(unsigned_url):
+ if ks:
+ unsigned_url += '/ks/%s' % ks
+ if referrer:
+ unsigned_url += '?referrer=%s' % referrer
+ return unsigned_url
+
formats = []
for f in flavor_assets:
# Continue if asset is not ready
if f['status'] != 2:
continue
- video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
- if referrer:
- video_url += '?referrer=%s' % referrer
+ video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
formats.append({
'format_id': '%(fileExt)s-%(bitrate)s' % f,
'ext': f.get('fileExt'),
@@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):
'width': int_or_none(f.get('width')),
'url': video_url,
})
- m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp')
- if referrer:
- m3u8_url += '?referrer=%s' % referrer
+ m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
formats.extend(self._extract_m3u8_formats(
m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 08a671fa8..61739efa7 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.khanacademy.org/video/one-time-pad',
- 'md5': '7021db7f2d47d4fff89b13177cb1e8f4',
+ 'md5': '7b391cce85e758fb94f763ddc1bbb979',
'info_dict': {
'id': 'one-time-pad',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'duration': 176,
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644
index 000000000..931f34c9b
--- /dev/null
+++ b/youtube_dl/extractor/kusi.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    timeconvert,
+    update_url_query,
+    xpath_text,
+)
+
+
+class KUSIIE(InfoExtractor):
+    # Matches story pages and direct clip URLs; only the latter carry a clipId.
+    _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+    _TESTS = [{
+        'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold',
+        'md5': 'f926e7684294cf8cb7bdf8858e1b3988',
+        'info_dict': {
+            'id': '12203019',
+            'ext': 'mp4',
+            'title': 'Turko Files: Case Closed! & Put On Hold!',
+            'duration': 231.0,
+            'upload_date': '20160210',
+            'timestamp': 1455087571,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+    }, {
+        'url': 'http://kusi.com/video?clipId=12203019',
+        'info_dict': {
+            'id': '12203019',
+            'ext': 'mp4',
+            'title': 'Turko Files: Case Closed! & Put On Hold!',
+            'duration': 231.0,
+            'upload_date': '20160210',
+            'timestamp': 1455087571,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # Same as previous one
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for a KUSI story page or clip URL.
+
+        The clip id is taken from the URL when present and from the story
+        page otherwise; title, metadata and formats are then read from the
+        XML document served by build.asp.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        clip_id = mobj.group('clipId')
+        # Story URLs have no clipId; the path serves as a provisional id.
+        video_id = clip_id or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        if clip_id is None:
+            video_id = clip_id = self._html_search_regex(
+                r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
+
+        affiliate_id = self._search_regex(
+            r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
+
+        # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
+        xml_url = update_url_query('http://www.kusi.com/build.asp', {
+            'buildtype': 'buildfeaturexmlrequest',
+            'featureType': 'Clip',
+            'featureid': clip_id,
+            'affiliateno': affiliate_id,
+            'clientgroupid': '1',
+            'rnd': int(round(random.random() * 1000000)),
+        })
+
+        doc = self._download_xml(xml_url, video_id)
+
+        video_title = xpath_text(doc, 'HEADLINE', fatal=True)
+        duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
+        description = xpath_text(doc, 'ABSTRACT')
+        thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
+        creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+
+        media_ns = '{http://search.yahoo.com/mrss/}'
+        # find() yields None when the document has no media group; fall back
+        # to an empty list rather than failing with an AttributeError.
+        # NOTE(review): confirm _sort_formats reports the empty list properly.
+        media_group = doc.find(media_ns + 'group')
+        quality_options = (
+            media_group.findall(media_ns + 'content')
+            if media_group is not None else [])
+        formats = []
+        for quality in quality_options:
+            formats.append({
+                'url': compat_urllib_parse_unquote_plus(quality.attrib['url']),
+                'height': int_or_none(quality.attrib.get('height')),
+                'width': int_or_none(quality.attrib.get('width')),
+                'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': description,
+            'duration': duration,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'timestamp': creation_time,
+        }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index f641edef8..700e44b63 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -68,6 +68,7 @@ class KuwoIE(KuwoBaseIE):
'id': '6446136',
'ext': 'mp3',
'title': '心',
+ 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a',
'creator': 'IU',
'upload_date': '20150518',
},
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py
index 9665ece89..df47e88ba 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/leeco.py
@@ -1,36 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import datetime
+import hashlib
import re
import time
-import base64
-import hashlib
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_ord,
compat_str,
+ compat_urllib_parse,
)
from ..utils import (
determine_ext,
+ encode_data_uri,
ExtractorError,
+ int_or_none,
+ orderedSet,
parse_iso8601,
sanitized_Request,
- int_or_none,
str_or_none,
- encode_data_uri,
url_basename,
)
-class LetvIE(InfoExtractor):
+class LeIE(InfoExtractor):
IE_DESC = '乐视网'
- _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
+ _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html'
+
+ _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
_TESTS = [{
- 'url': 'http://www.letv.com/ptv/vplay/22005890.html',
+ 'url': 'http://www.le.com/ptv/vplay/22005890.html',
'md5': 'edadcfe5406976f42f9f266057ee5e40',
'info_dict': {
'id': '22005890',
@@ -42,7 +45,7 @@ class LetvIE(InfoExtractor):
'hls_prefer_native': True,
},
}, {
- 'url': 'http://www.letv.com/ptv/vplay/1415246.html',
+ 'url': 'http://www.le.com/ptv/vplay/1415246.html',
'info_dict': {
'id': '1415246',
'ext': 'mp4',
@@ -54,7 +57,7 @@ class LetvIE(InfoExtractor):
},
}, {
'note': 'This video is available only in Mainland China, thus a proxy is needed',
- 'url': 'http://www.letv.com/ptv/vplay/1118082.html',
+ 'url': 'http://www.le.com/ptv/vplay/1118082.html',
'md5': '2424c74948a62e5f31988438979c5ad1',
'info_dict': {
'id': '1118082',
@@ -94,17 +97,16 @@ class LetvIE(InfoExtractor):
return encrypted_data
encrypted_data = encrypted_data[5:]
- _loc4_ = bytearray()
- while encrypted_data:
- b = compat_ord(encrypted_data[0])
- _loc4_.extend([b // 16, b & 0x0f])
- encrypted_data = encrypted_data[1:]
+ _loc4_ = bytearray(2 * len(encrypted_data))
+ for idx, val in enumerate(encrypted_data):
+ b = compat_ord(val)
+ _loc4_[2 * idx] = b // 16
+ _loc4_[2 * idx + 1] = b % 16
idx = len(_loc4_) - 11
_loc4_ = _loc4_[idx:] + _loc4_[:idx]
- _loc7_ = bytearray()
- while _loc4_:
- _loc7_.append(_loc4_[0] * 16 + _loc4_[1])
- _loc4_ = _loc4_[2:]
+ _loc7_ = bytearray(len(encrypted_data))
+ for i in range(len(encrypted_data)):
+ _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
return bytes(_loc7_)
@@ -117,10 +119,10 @@ class LetvIE(InfoExtractor):
'splatid': 101,
'format': 1,
'tkey': self.calc_time_key(int(time.time())),
- 'domain': 'www.letv.com'
+ 'domain': 'www.le.com'
}
play_json_req = sanitized_Request(
- 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
+ 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
)
cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
if cn_verification_proxy:
@@ -193,26 +195,51 @@ class LetvIE(InfoExtractor):
}
-class LetvTvIE(InfoExtractor):
- _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html'
+class LePlaylistIE(InfoExtractor):
+ _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)'
+
_TESTS = [{
- 'url': 'http://www.letv.com/tv/46177.html',
+ 'url': 'http://www.le.com/tv/46177.html',
'info_dict': {
'id': '46177',
'title': '美人天下',
'description': 'md5:395666ff41b44080396e59570dbac01c'
},
'playlist_count': 35
+ }, {
+ 'url': 'http://tv.le.com/izt/wuzetian/index.html',
+ 'info_dict': {
+ 'id': 'wuzetian',
+ 'title': '武媚娘传奇',
+ 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+ },
+ # This playlist contains some extra videos other than the drama itself
+ 'playlist_mincount': 96
+ }, {
+ 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+ # This series is moved to http://www.le.com/tv/10005297.html
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.le.com/comic/92063.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+ 'only_matching': True,
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
def _real_extract(self, url):
playlist_id = self._match_id(url)
page = self._download_webpage(url, playlist_id)
- media_urls = list(set(re.findall(
- r'http://www.letv.com/ptv/vplay/\d+.html', page)))
- entries = [self.url_result(media_url, ie='Letv')
- for media_url in media_urls]
+ # Currently old domain names are still used in playlists
+ media_ids = orderedSet(re.findall(
+ r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+ entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+ for media_id in media_ids]
title = self._html_search_meta('keywords', page,
fatal=False).split(',')[0]
@@ -222,31 +249,9 @@ class LetvTvIE(InfoExtractor):
playlist_description=description)
-class LetvPlaylistIE(LetvTvIE):
- _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html'
- _TESTS = [{
- 'url': 'http://tv.letv.com/izt/wuzetian/index.html',
- 'info_dict': {
- 'id': 'wuzetian',
- 'title': '武媚娘传奇',
- 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
- },
- # This playlist contains some extra videos other than the drama itself
- 'playlist_mincount': 96
- }, {
- 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml',
- 'info_dict': {
- 'id': 'lswjzzjc',
- # The title should be "劲舞青春", but I can't find a simple way to
- # determine the playlist title
- 'title': '乐视午间自制剧场',
- 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489'
- },
- 'playlist_mincount': 7
- }]
-
-
class LetvCloudIE(InfoExtractor):
+ # Most of *.letv.com is changed to *.le.com on 2016/01/02
+ # but yuntv.letv.com is kept, so also keep the extractor name
IE_DESC = '乐视云'
_VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
@@ -327,7 +332,7 @@ class LetvCloudIE(InfoExtractor):
formats.append({
'url': url,
'ext': determine_ext(decoded_url),
- 'format_id': int_or_none(play_url.get('vtype')),
+ 'format_id': str_or_none(play_url.get('vtype')),
'format_note': str_or_none(play_url.get('definition')),
'width': int_or_none(play_url.get('vwidth')),
'height': int_or_none(play_url.get('vheight')),
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index f8cbca7b3..a8fd639cc 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor):
_VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://lifenews.ru/news/126342',
- 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+ # single video embedded via video/source
+ 'url': 'http://lifenews.ru/news/98736',
+ 'md5': '77c95eaefaca216e32a76a343ad89d23',
'info_dict': {
- 'id': '126342',
+ 'id': '98736',
'ext': 'mp4',
- 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
- 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
- 'thumbnail': 're:http://.*\.jpg',
- 'upload_date': '20140130',
+ 'title': 'Мужчина нашел дома архив оборонного завода',
+ 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+ 'upload_date': '20120805',
}
}, {
- # video in <iframe>
+ # single video embedded via iframe
'url': 'http://lifenews.ru/news/152125',
'md5': '77d19a6f0886cd76bdbf44b4d971a273',
'info_dict': {
@@ -42,15 +42,33 @@ class LifeNewsIE(InfoExtractor):
'upload_date': '20150402',
}
}, {
+ # two videos embedded via iframe
'url': 'http://lifenews.ru/news/153461',
- 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
'info_dict': {
'id': '153461',
- 'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
'upload_date': '20150505',
- }
+ },
+ 'playlist': [{
+ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ 'info_dict': {
+ 'id': '153461-video1',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ },
+ }, {
+ 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322',
+ 'info_dict': {
+ 'id': '153461-video2',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ },
+ }],
}, {
'url': 'http://lifenews.ru/video/13035',
'only_matching': True,
@@ -65,10 +83,14 @@ class LifeNewsIE(InfoExtractor):
'http://lifenews.ru/%s/%s' % (section, video_id),
video_id, 'Downloading page')
- videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
- iframe_link = self._html_search_regex(
- '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
- if not videos and not iframe_link:
+ video_urls = re.findall(
+ r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
+
+ iframe_links = re.findall(
+ r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']',
+ webpage)
+
+ if not video_urls and not iframe_links:
raise ExtractorError('No media links available for %s' % video_id)
title = remove_end(
@@ -95,31 +117,44 @@ class LifeNewsIE(InfoExtractor):
'upload_date': upload_date,
}
- def make_entry(video_id, media, video_number=None):
+ def make_entry(video_id, video_url, index=None):
cur_info = dict(common_info)
cur_info.update({
- 'id': video_id,
- 'url': media[1],
- 'thumbnail': media[0],
- 'title': title if video_number is None else '%s-video%s' % (title, video_number),
+ 'id': video_id if not index else '%s-video%s' % (video_id, index),
+ 'url': video_url,
+ 'title': title if not index else '%s (Видео %s)' % (title, index),
})
return cur_info
- if iframe_link:
- iframe_link = self._proto_relative_url(iframe_link, 'http:')
- cur_info = dict(common_info)
- cur_info.update({
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': iframe_link,
- })
+ def make_video_entry(video_id, video_url, index=None):
+ video_url = compat_urlparse.urljoin(url, video_url)
+ return make_entry(video_id, video_url, index)
+
+ def make_iframe_entry(video_id, video_url, index=None):
+ video_url = self._proto_relative_url(video_url, 'http:')
+ cur_info = make_entry(video_id, video_url, index)
+ cur_info['_type'] = 'url_transparent'
return cur_info
- if len(videos) == 1:
- return make_entry(video_id, videos[0])
- else:
- return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+ if len(video_urls) == 1 and not iframe_links:
+ return make_video_entry(video_id, video_urls[0])
+
+ if len(iframe_links) == 1 and not video_urls:
+ return make_iframe_entry(video_id, iframe_links[0])
+
+ entries = []
+
+ if video_urls:
+ for num, video_url in enumerate(video_urls, 1):
+ entries.append(make_video_entry(video_id, video_url, num))
+
+ if iframe_links:
+ for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1):
+ entries.append(make_iframe_entry(video_id, iframe_link, num))
+
+ playlist = common_info.copy()
+ playlist.update(self.playlist_result(entries, video_id, title, description))
+ return playlist
class LifeEmbedIE(InfoExtractor):
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 38fb3d9e4..eada7c299 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -14,6 +14,7 @@ from ..utils import (
xpath_with_ns,
xpath_text,
orderedSet,
+ update_url_query,
int_or_none,
float_or_none,
parse_iso8601,
@@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor):
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base_ele = find_xpath_attr(
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
- base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/'
+ base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
formats = []
video_nodes = smil.findall(self._xpath_ns('.//video', namespace))
@@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor):
for vn in video_nodes:
tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)
furl = (
- '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src']))
+ update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), {
+ 'v': '3.0.3',
+ 'fp': 'WIN% 14,0,0,145',
+ }))
if 'clipBegin' in vn.attrib:
furl += '&ssek=' + vn.attrib['clipBegin']
formats.append({
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py
new file mode 100644
index 000000000..f5d00e61d
--- /dev/null
+++ b/youtube_dl/extractor/makerschannel.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MakersChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
+        'md5': '624a512c6969236b5967bf9286345ad1',
+        'info_dict': {
+            'id': '849',
+            'ext': 'mp4',
+            'title': 'Landing a bus on a plane is an epic win',
+            'uploader': 'ZoomIn',
+            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Resolve a Makers Channel page to its underlying Minoto video.
+
+        Metadata is read from the data-* attributes of the <div> tagged with
+        the requested video/production id; the result is a url_transparent
+        entry pointing at 'minoto:<id>'.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        id_type, url_id = mobj.group('id_type'), mobj.group('id')
+        webpage = self._download_webpage(url, url_id)
+        # Raw attribute string of the matching <div>
+        div_attrs = self._html_search_regex(
+            r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id),
+            webpage, 'video data')
+
+        def data_attr(attr, fatal=False):
+            # Value of the data-<attr> attribute within the matched <div>
+            return self._html_search_regex(
+                r'data-%s\s*=\s*"([^"]+)"' % attr, div_attrs, attr, fatal=fatal)
+
+        video_src = data_attr('video-src', True)
+        minoto_id = self._search_regex(
+            r'/id/([a-zA-Z0-9]+)', video_src, 'minoto id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'minoto:%s' % minoto_id,
+            'id': data_attr('video-id', True),
+            'title': data_attr('title', True),
+            'description': data_attr('description'),
+            'thumbnail': data_attr('image'),
+            'uploader': data_attr('channel'),
+        }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 425fc9e2a..2338e7f96 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -14,7 +14,7 @@ from ..utils import (
class MDRIE(InfoExtractor):
IE_DESC = 'MDR.DE and KiKA'
- _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P<id>\d+)(?:_.+?)?\.html'
_TESTS = [{
# MDR regularly deletes its videos
@@ -60,6 +60,9 @@ class MDRIE(InfoExtractor):
}, {
'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -68,8 +71,8 @@ class MDRIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
data_url = self._search_regex(
- r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
- webpage, 'data url', group='url')
+ r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1',
+ webpage, 'data url', default=None, group='url').replace('\/', '/')
doc = self._download_xml(
compat_urlparse.urljoin(url, data_url), video_id)
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
new file mode 100644
index 000000000..959a10589
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MinotoIE(InfoExtractor):
+    # Matches player/iframe/embed URLs as well as the internal 'minoto:<id>' form.
+    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    def _real_extract(self, url):
+        """Return the info dict for the Minoto video referenced by url.
+
+        The player JSON is downloaded once; each entry of its 'video-files'
+        list becomes one format, except HLS entries, which are expanded into
+        their variants through _extract_m3u8_formats.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        # The 'minoto:' form carries no player id, so player '1' is used.
+        player_id = mobj.group('player_id') or '1'
+        video_id = mobj.group('id')
+        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+        video_metadata = video_data['video-metadata']
+        formats = []
+        for fmt in video_data['video-files']:
+            fmt_url = fmt.get('url')
+            if not fmt_url:
+                continue
+            container = fmt.get('container')
+            if container == 'hls':
+                # extend() needs the list of formats parsed from the playlist,
+                # not the raw playlist URL and parser arguments.
+                formats.extend(self._extract_m3u8_formats(
+                    fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                fmt_profile = fmt.get('profile') or {}
+                f = {
+                    'format_id': fmt_profile.get('name-short'),
+                    'format_note': fmt_profile.get('name'),
+                    'url': fmt_url,
+                    'container': container,
+                    'tbr': int_or_none(fmt.get('bitrate')),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                }
+                codecs = fmt.get('codecs')
+                if codecs:
+                    codecs = codecs.split(',')
+                    # A two-element list is taken as (video codec, audio codec).
+                    if len(codecs) == 2:
+                        f.update({
+                            'vcodec': codecs[0],
+                            'acodec': codecs[1],
+                        })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        # 'video-poster' may be absent or null; treat both as "no thumbnail".
+        poster = video_metadata.get('video-poster') or {}
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'thumbnail': poster.get('url'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 29ca45778..819c1b90b 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor):
'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
'info_dict': {
'id': 'EObHWIEKGjA',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
'upload_date': '20121109',
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index c2b7ed9ab..101497118 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
HEADRequest,
+ parse_count,
str_to_int,
)
@@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):
uploader_id = self._search_regex(
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
- like_count = str_to_int(self._search_regex(
- r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
+ like_count = parse_count(self._search_regex(
+ r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
webpage, 'like count', fatal=False))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 97d5da626..0b4787c1d 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
str_to_int,
unified_strdate,
)
@@ -12,55 +13,62 @@ from ..utils import (
class MotherlessIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
- _TESTS = [
- {
- 'url': 'http://motherless.com/AC3FFE1',
- 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
- 'info_dict': {
- 'id': 'AC3FFE1',
- 'ext': 'mp4',
- 'title': 'Fucked in the ass while playing PS3',
- 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
- 'upload_date': '20100913',
- 'uploader_id': 'famouslyfuckedup',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
- },
- {
- 'url': 'http://motherless.com/532291B',
- 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
- 'info_dict': {
- 'id': '532291B',
- 'ext': 'mp4',
- 'title': 'Amazing girl playing the omegle game, PERFECT!',
- 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'],
- 'upload_date': '20140622',
- 'uploader_id': 'Sulivana7x',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ _TESTS = [{
+ 'url': 'http://motherless.com/AC3FFE1',
+ 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+ 'info_dict': {
+ 'id': 'AC3FFE1',
+ 'ext': 'mp4',
+ 'title': 'Fucked in the ass while playing PS3',
+ 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+ 'upload_date': '20100913',
+ 'uploader_id': 'famouslyfuckedup',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'http://motherless.com/532291B',
+ 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+ 'info_dict': {
+ 'id': '532291B',
+ 'ext': 'mp4',
+ 'title': 'Amazing girl playing the omegle game, PERFECT!',
+ 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
+ 'game', 'hairy'],
+ 'upload_date': '20140622',
+ 'uploader_id': 'Sulivana7x',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
},
- {
- 'url': 'http://motherless.com/g/cosplay/633979F',
- 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
- 'info_dict': {
- 'id': '633979F',
- 'ext': 'mp4',
- 'title': 'Turtlette',
- 'categories': ['superheroine heroine superher'],
- 'upload_date': '20140827',
- 'uploader_id': 'shade0230',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ 'skip': '404',
+ }, {
+ 'url': 'http://motherless.com/g/cosplay/633979F',
+ 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+ 'info_dict': {
+ 'id': '633979F',
+ 'ext': 'mp4',
+ 'title': 'Turtlette',
+ 'categories': ['superheroine heroine superher'],
+ 'upload_date': '20140827',
+ 'uploader_id': 'shade0230',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
}
- ]
+ }, {
+ # no keywords
+ 'url': 'http://motherless.com/8B4BBC1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ if any(p in webpage for p in (
+ '<title>404 - MOTHERLESS.COM<',
+ ">The page you're looking for cannot be found.<")):
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
title = self._html_search_regex(
r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
video_url = self._html_search_regex(
@@ -86,7 +94,7 @@ class MotherlessIE(InfoExtractor):
r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
webpage, 'uploader_id')
- categories = self._html_search_meta('keywords', webpage)
+ categories = self._html_search_meta('keywords', webpage, default=None)
if categories:
categories = [cat.strip() for cat in categories.split(',')]
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index e8bb527b8..ed068365d 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -11,6 +11,7 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ float_or_none,
HEADRequest,
sanitized_Request,
unescapeHTML,
@@ -110,7 +111,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
uri = itemdoc.find('guid').text
video_id = self._id_from_uri(uri)
self.report_extraction(video_id)
- mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+ content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
+ mediagen_url = content_el.attrib['url']
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url:
@@ -165,6 +167,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,
+ 'duration': float_or_none(content_el.attrib.get('duration')),
}
def _get_feed_query(self, uri):
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index a071378b6..3e2b3e599 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,18 +1,26 @@
from __future__ import unicode_literals
+import functools
+import os.path
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urlparse,
+)
from ..utils import (
- parse_duration,
int_or_none,
+ OnDemandPagedList,
+ parse_duration,
+ remove_start,
xpath_text,
xpath_attr,
)
class NBAIE(InfoExtractor):
- _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
_TESTS = [{
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'md5': '9e7729d3010a9c71506fd1248f74e4f4',
@@ -44,14 +52,101 @@ class NBAIE(InfoExtractor):
'timestamp': 1432134543,
'upload_date': '20150520',
}
+ }, {
+ 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
+ 'info_dict': {
+ 'id': '1455672027478-Doc_Feb16_720',
+ 'ext': 'mp4',
+ 'title': 'Practice: Doc Rivers - 2/16/16',
+ 'description': 'Head Coach Doc Rivers addresses the media following practice.',
+ 'upload_date': '20160217',
+ 'timestamp': 1455672000,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'timberwolves',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ },
+ 'playlist_count': 30,
+ 'params': {
+ # Download the whole playlist takes too long time
+ 'playlist_items': '1-30',
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'Wigginsmp4',
+ 'ext': 'mp4',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
+ 'upload_date': '20141212',
+ 'timestamp': 1418418600,
+ },
+ 'params': {
+ 'noplaylist': True,
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
+ _PAGE_SIZE = 30
+
+ def _fetch_page(self, team, video_id, page):
+ search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({
+ 'type': 'teamvideo',
+ 'start': page * self._PAGE_SIZE + 1,
+ 'npp': (page + 1) * self._PAGE_SIZE + 1,
+ 'sort': 'recent',
+ 'output': 'json',
+ 'site': team,
+ })
+ results = self._download_json(
+ search_url, video_id, note='Download page %d of playlist data' % page)['results'][0]
+ for item in results:
+ yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
+
+ def _extract_playlist(self, orig_path, video_id, webpage):
+ team = orig_path.split('/')[0]
+
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading just video because of --no-playlist')
+ video_path = self._search_regex(
+ r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
+ video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
+ return self.url_result(video_url)
+
+ self.to_screen('Downloading playlist - add --no-playlist to just download video')
+ playlist_title = self._og_search_title(webpage, fatal=False)
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, team, video_id),
+ self._PAGE_SIZE, use_cache=True)
+
+ return self.playlist_result(entries, team, playlist_title)
+
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
+ orig_path = path
if path.startswith('nba/'):
path = path[3:]
+
+ if 'video/' not in path:
+ webpage = self._download_webpage(url, video_id)
+ path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+
+ if path == '{{id}}':
+ return self._extract_playlist(orig_path, video_id, webpage)
+
+ # See prepareContentId() of pkgCvp.js
+ if path.startswith('video/teams'):
+ path = 'video/channels/proxy/' + path[6:]
+
video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id)
- video_id = xpath_text(video_info, 'slug')
+ video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0]
title = xpath_text(video_info, 'headline')
description = xpath_text(video_info, 'description')
duration = parse_duration(xpath_text(video_info, 'length'))
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index a126f5054..3b21fbd4d 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_urllib_parse_unquote,
+)
from ..utils import (
determine_ext,
ExtractorError,
@@ -87,7 +90,7 @@ class NRKIE(InfoExtractor):
class NRKPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -126,6 +129,37 @@ class NRKPlaylistIE(InfoExtractor):
entries, playlist_id, playlist_title, playlist_description)
+class NRKSkoleIE(InfoExtractor):
+ IE_DESC = 'NRK Skole'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532',
+ 'md5': '04cd85877cc1913bce73c5d28a47e00f',
+ 'info_dict': {
+ 'id': '6021',
+ 'ext': 'flv',
+ 'title': 'Genetikk og eneggede tvillinger',
+ 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
+ 'duration': 399,
+ },
+ }, {
+ 'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = compat_urllib_parse_unquote(self._match_id(url))
+
+ webpage = self._download_webpage(url, video_id)
+
+ nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
+ return self.url_result('nrk:%s' % nrk_id)
+
+
class NRKTVIE(InfoExtractor):
IE_DESC = 'NRK TV and NRK Radio'
_VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index cca012953..f43e3a146 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -338,6 +338,21 @@ class PBSIE(InfoExtractor):
},
},
{
+ # Serves hd only via wigget/partnerplayer page
+ 'url': 'http://www.pbs.org/video/2365641075/',
+ 'info_dict': {
+ 'id': '2365641075',
+ 'ext': 'mp4',
+ 'title': 'FRONTLINE - Netanyahu at War',
+ 'duration': 6852,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'formats': 'mincount:8',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
},
@@ -437,34 +452,54 @@ class PBSIE(InfoExtractor):
for vid_id in video_id]
return self.playlist_result(entries, display_id)
+ info = None
+ redirects = []
+ redirect_urls = set()
+
+ def extract_redirect_urls(info):
+ for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+ redirect = info.get(encoding_name)
+ if not redirect:
+ continue
+ redirect_url = redirect.get('url')
+ if redirect_url and redirect_url not in redirect_urls:
+ redirects.append(redirect)
+ redirect_urls.add(redirect_url)
+
try:
- info = self._download_json(
+ video_info = self._download_json(
'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
display_id, 'Downloading video info JSON')
+ extract_redirect_urls(video_info)
+ info = video_info
except ExtractorError as e:
+ # videoInfo API may not work for some videos
if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404:
raise
- # videoInfo API may not work for some videos, fallback to portalplayer API
+
+ # Player pages may also serve different qualities
+ for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
- 'http://player.pbs.org/portalplayer/%s' % video_id, display_id)
- info = self._parse_json(
- self._search_regex(
- r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
- player, 'video data', default='{}'),
- display_id, transform_source=js_to_json, fatal=False)
+ 'http://player.pbs.org/%s/%s' % (page, video_id),
+ display_id, 'Downloading %s page' % page, fatal=False)
+ if player:
+ video_info = self._parse_json(
+ self._search_regex(
+ r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+ player, '%s video data' % page, default='{}'),
+ display_id, transform_source=js_to_json, fatal=False)
+ if video_info:
+ extract_redirect_urls(video_info)
+ if not info:
+ info = video_info
formats = []
- for encoding_name in ('recommended_encoding', 'alternate_encoding'):
- redirect = info.get(encoding_name)
- if not redirect:
- continue
- redirect_url = redirect.get('url')
- if not redirect_url:
- continue
+ for num, redirect in enumerate(redirects):
+ redirect_id = redirect.get('eeid')
redirect_info = self._download_json(
- redirect_url + '?format=json', display_id,
- 'Downloading %s video url info' % encoding_name)
+ '%s?format=json' % redirect['url'], display_id,
+ 'Downloading %s video url info' % (redirect_id or num))
if redirect_info['status'] == 'error':
raise ExtractorError(
@@ -483,8 +518,9 @@ class PBSIE(InfoExtractor):
else:
formats.append({
'url': format_url,
- 'format_id': redirect.get('eeid'),
+ 'format_id': redirect_id,
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
rating_str = info.get('rating')
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 6d5732d45..30a5f2de4 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -12,14 +12,14 @@ class PyvideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
- 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+ 'md5': '520915673e53a5c5d487c36e0c4d85b5',
'info_dict': {
'id': '24_4WWkSmNo',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Become a logging expert in 30 minutes',
'description': 'md5:9665350d466c67fb5b1598de379021f7',
'upload_date': '20130320',
- 'uploader': 'NextDayVideo',
+ 'uploader': 'Next Day Video',
'uploader_id': 'NextDayVideo',
},
'add_ie': ['Youtube'],
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
index b1b8800b9..99979ebe1 100644
--- a/youtube_dl/extractor/revision3.py
+++ b/youtube_dl/extractor/revision3.py
@@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor):
'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
'md5': 'd94a72d85d0a829766de4deb8daaf7df',
'info_dict': {
- 'id': '73034',
+ 'id': '71089',
'display_id': 'technobuffalo/5-google-predictions-for-2016',
'ext': 'webm',
'title': '5 Google Predictions for 2016',
@@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor):
'uploader_id': 'technobuffalo',
}
}, {
+ # Show
'url': 'http://testtube.com/brainstuff',
'info_dict': {
'id': '251',
@@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor):
}, {
'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
'info_dict': {
- 'id': '60163',
+ 'id': '58227',
'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
'duration': 275,
'ext': 'webm',
@@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor):
'uploader': 'DNews',
'uploader_id': 'dnews',
},
+ }, {
+ 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+ 'info_dict': {
+ 'id': '71618',
+ 'ext': 'mp4',
+ 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+ 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes',
+ 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start',
+ 'uploader': 'Editors\' Picks',
+ 'uploader_id': 'tt-editors-picks',
+ 'timestamp': 1453309200,
+ 'upload_date': '20160120',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ # Tag
+ 'url': 'http://testtube.com/tech-news',
+ 'info_dict': {
+ 'id': '21018',
+ 'title': 'tech news',
+ },
+ 'playlist_mincount': 9,
}]
_PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
_API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
def _real_extract(self, url):
domain, display_id = re.match(self._VALID_URL, url).groups()
+ site = domain.split('.')[0]
page_info = self._download_json(
self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
- if page_info['data']['type'] == 'episode':
- episode_data = page_info['data']
- video_id = compat_str(episode_data['video']['data']['id'])
+ page_data = page_info['data']
+ page_type = page_data['type']
+ if page_type in ('episode', 'embed'):
+ show_data = page_data['show']['data']
+ page_id = compat_str(page_data['id'])
+ video_id = compat_str(page_data['video']['data']['id'])
+
+ preference = qualities(['mini', 'small', 'medium', 'large'])
+ thumbnails = [{
+ 'url': image_url,
+ 'id': image_id,
+ 'preference': preference(image_id)
+ } for image_id, image_url in page_data.get('images', {}).items()]
+
+ info = {
+ 'id': page_id,
+ 'display_id': display_id,
+ 'title': unescapeHTML(page_data['name']),
+ 'description': unescapeHTML(page_data.get('summary')),
+ 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '),
+ 'author': page_data.get('author'),
+ 'uploader': show_data.get('name'),
+ 'uploader_id': show_data.get('slug'),
+ 'thumbnails': thumbnails,
+ 'extractor_key': site,
+ }
+
+ if page_type == 'embed':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': page_data['video']['data']['embed'],
+ })
+ return info
+
video_data = self._download_json(
'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
video_id)['items'][0]
@@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor):
})
self._sort_formats(formats)
- preference = qualities(['mini', 'small', 'medium', 'large'])
- thumbnails = [{
- 'url': image_url,
- 'id': image_id,
- 'preference': preference(image_id)
- } for image_id, image_url in video_data.get('images', {}).items()]
-
- return {
- 'id': video_id,
- 'display_id': display_id,
+ info.update({
'title': unescapeHTML(video_data['title']),
'description': unescapeHTML(video_data.get('summary')),
- 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '),
- 'author': episode_data.get('author'),
'uploader': video_data.get('show', {}).get('name'),
'uploader_id': video_data.get('show', {}).get('slug'),
'duration': int_or_none(video_data.get('duration')),
- 'thumbnails': thumbnails,
'formats': formats,
- }
+ })
+ return info
else:
- show_data = page_info['show']['data']
+ list_data = page_info[page_type]['data']
episodes_data = page_info['episodes']['data']
num_episodes = page_info['meta']['totalEpisodes']
processed_episodes = 0
entries = []
page_num = 1
while True:
- entries.extend([self.url_result(
- 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data])
+ entries.extend([{
+ '_type': 'url',
+ 'url': 'http://%s%s' % (domain, episode['path']),
+ 'id': compat_str(episode['id']),
+ 'ie_key': 'Revision3',
+ 'extractor_key': site,
+ } for episode in episodes_data])
processed_episodes += len(episodes_data)
if processed_episodes == num_episodes:
break
@@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor):
display_id)['episodes']['data']
return self.playlist_result(
- entries, compat_str(show_data['id']),
- show_data.get('name'), show_data.get('summary'))
+ entries, compat_str(list_data['id']),
+ list_data.get('name'), list_data.get('summary'))
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 603d7bd00..8a8c5d2a0 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -10,6 +10,7 @@ from ..utils import (
ExtractorError,
float_or_none,
remove_end,
+ remove_start,
sanitized_Request,
std_headers,
struct_unpack,
@@ -178,14 +179,14 @@ class RTVEInfantilIE(InfoExtractor):
class RTVELiveIE(InfoExtractor):
IE_NAME = 'rtve.es:live'
IE_DESC = 'RTVE.es live streams'
- _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+ _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
_TESTS = [{
- 'url': 'http://www.rtve.es/noticias/directo-la-1/',
+ 'url': 'http://www.rtve.es/directo/la-1/',
'info_dict': {
- 'id': 'directo-la-1',
- 'ext': 'flv',
- 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+ 'id': 'la-1',
+ 'ext': 'mp4',
+ 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
},
'params': {
'skip_download': 'live stream',
@@ -198,23 +199,20 @@ class RTVELiveIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- player_url = self._search_regex(
- r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL')
- title = remove_end(self._og_search_title(webpage), ' en directo')
+ title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+ title = remove_start(title, 'Estoy viendo ')
title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
vidplayer_id = self._search_regex(
- r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+ r'playerId=player([0-9]+)', webpage, 'internal video ID')
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
png = self._download_webpage(png_url, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
+ m3u8_url = _decrypt_url(png)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
return {
'id': video_id,
- 'ext': 'flv',
'title': title,
- 'url': video_url,
- 'app': 'rtve-live-live?ovpfv=2.1.2',
- 'player_url': player_url,
- 'rtmp_live': True,
+ 'formats': formats,
+ 'is_live': True,
}
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 7de7b7273..256396bb8 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -4,14 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
from ..utils import (
ExtractorError,
sanitized_Request,
- smuggle_url,
std_headers,
urlencode_postdata,
+ update_url_query,
)
@@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor):
_SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
_NETRC_MACHINE = 'safari'
- _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+ _API_BASE = 'https://www.safaribooksonline.com/api/v1'
_API_FORMAT = 'json'
LOGGED_IN = False
def _real_initialize(self):
- # We only need to log in once for courses or individual videos
- if not self.LOGGED_IN:
- self._login()
- SafariBaseIE.LOGGED_IN = True
+ self._login()
def _login(self):
+ # We only need to log in once for courses or individual videos
+ if self.LOGGED_IN:
+ return
+
(username, password) = self._get_login_info()
if username is None:
- self.raise_login_required('safaribooksonline.com account is required')
+ return
- headers = std_headers
+ headers = std_headers.copy()
if 'Referer' not in headers:
headers['Referer'] = self._LOGIN_URL
+ login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)
login_page = self._download_webpage(
- self._LOGIN_URL, None,
+ login_page_request, None,
'Downloading login form')
csrf = self._html_search_regex(
@@ -66,6 +67,8 @@ class SafariBaseIE(InfoExtractor):
'Login failed; make sure your credentials are correct and try again.',
expected=True)
+ SafariBaseIE.LOGGED_IN = True
+
self.to_screen('Login successful')
@@ -85,13 +88,15 @@ class SafariIE(SafariBaseIE):
_TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
- 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+ 'md5': 'dcc5a425e79f2564148652616af1f2a3',
'info_dict': {
- 'id': '2842601850001',
+ 'id': '0_qbqx90ic',
'ext': 'mp4',
- 'title': 'Introduction',
+ 'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+ 'timestamp': 1437758058,
+ 'upload_date': '20150724',
+ 'uploader_id': 'stork',
},
- 'skip': 'Requires safaribooksonline account credentials',
}, {
'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
'only_matching': True,
@@ -106,15 +111,30 @@ class SafariIE(SafariBaseIE):
course_id = mobj.group('course_id')
part = mobj.group('part')
- webpage = self._download_webpage(
- '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
- part)
+ webpage = self._download_webpage(url, '%s/%s' % (course_id, part))
+ reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id')
+ partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id')
+ ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id')
+
+ query = {
+ 'wid': '_%s' % partner_id,
+ 'uiconf_id': ui_id,
+ 'flashvars[referenceId]': reference_id,
+ }
- bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if not bc_url:
- raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+ if self.LOGGED_IN:
+ kaltura_session = self._download_json(
+ '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+ course_id, 'Downloading kaltura session JSON',
+ 'Unable to download kaltura session JSON', fatal=False)
+ if kaltura_session:
+ session = kaltura_session.get('session')
+ if session:
+ query['flashvars[ks]'] = session
- return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+ return self.url_result(update_url_query(
+ 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+ 'Kaltura')
class SafariCourseIE(SafariBaseIE):
@@ -140,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):
course_id = self._match_id(url)
course_json = self._download_json(
- '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+ '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
course_id, 'Downloading course JSON')
if 'chapters' not in course_json:
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index 2cf210e0d..44b0bbee6 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -70,25 +70,27 @@ class ScreenwaveMediaIE(InfoExtractor):
formats = []
for source in sources:
- if source['type'] == 'hls':
- formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4'))
+ file_ = source.get('file')
+ if not file_:
+ continue
+ if source.get('type') == 'hls':
+ formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4'))
else:
- file_ = source.get('file')
- if not file_:
- continue
- format_label = source.get('label')
format_id = self._search_regex(
r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
+ if not self._is_valid_url(file_, video_id, format_id or 'video'):
+ continue
+ format_label = source.get('label')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]', format_label, 'height', default=None))
formats.append({
- 'url': source['file'],
+ 'url': file_,
'format_id': format_id,
'format': format_label,
'ext': source.get('type'),
'height': height,
})
- self._sort_formats(formats)
+ self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py
index 6365a8779..a99b2a8e7 100644
--- a/youtube_dl/extractor/sexu.py
+++ b/youtube_dl/extractor/sexu.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -14,7 +12,7 @@ class SexuIE(InfoExtractor):
'id': '961791',
'ext': 'mp4',
'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
- 'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
+ 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
'categories': list, # NSFW
'thumbnail': 're:https?://.*\.jpg$',
'age_limit': 18,
@@ -25,13 +23,18 @@ class SexuIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- quality_arr = self._search_regex(
- r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string')
+ jwvideo = self._parse_json(
+ self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
+ video_id)
+
+ sources = jwvideo['sources']
+
formats = [{
- 'url': fmt[0].replace('\\', ''),
- 'format_id': fmt[1],
- 'height': int(fmt[1][:3]),
- } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)]
+ 'url': source['file'].replace('\\', ''),
+ 'format_id': source.get('label'),
+ 'height': self._search_regex(
+ r'^(\d+)[pP]', source.get('label', ''), 'height', default=None),
+ } for source in sources if source.get('file')]
self._sort_formats(formats)
title = self._html_search_regex(
@@ -40,9 +43,7 @@ class SexuIE(InfoExtractor):
description = self._html_search_meta(
'description', webpage, 'description')
- thumbnail = self._html_search_regex(
- r'image:\s*"([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = jwvideo.get('image')
categories_str = self._html_search_meta(
'keywords', webpage, 'categories')
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
deleted file mode 100644
index ebb5d6ec0..000000000
--- a/youtube_dl/extractor/space.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
-from ..utils import RegexNotFoundError, ExtractorError
-
-
-class SpaceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
- _TEST = {
- 'add_ie': ['BrightcoveLegacy'],
- 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
- 'info_dict': {
- 'id': '2780937028001',
- 'ext': 'mp4',
- 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
- 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
- 'uploader': 'TechMedia Networks',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- try:
- # Some videos require the playerKey field, which isn't define in
- # the BrightcoveExperience object
- brightcove_url = self._og_search_video_url(webpage)
- except RegexNotFoundError:
- # Other videos works fine with the info from the object
- brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if brightcove_url is None:
- raise ExtractorError(
- 'The webpage does not contain a video', expected=True)
- return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index a48d77c30..cf8851438 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -73,7 +73,7 @@ class TEDIE(InfoExtractor):
'add_ie': ['Youtube'],
'info_dict': {
'id': '_ZG8HBuDjgc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Douglas Adams: Parrots the Universe and Everything',
'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
'uploader': 'University of California Television (UCTV)',
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 6890021cf..9ee844684 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -48,8 +48,6 @@ class TF1IE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
wat_id = self._html_search_regex(
- r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
+ r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1',
webpage, 'wat id', group='id')
- wat_info = self._download_json(
- 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
- return self.url_result(wat_info['media']['url'], 'Wat')
+ return self.url_result('wat:%s' % wat_id, 'Wat')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 755f816ff..9a57b49df 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -21,6 +21,8 @@ from ..utils import (
sanitized_Request,
unsmuggle_url,
xpath_with_ns,
+ mimetype2ext,
+ find_xpath_attr,
)
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -30,15 +32,11 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
class ThePlatformBaseIE(InfoExtractor):
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
meta = self._download_xml(smil_url, video_id, note=note)
- try:
- error_msg = next(
- n.attrib['abstract']
- for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
- except StopIteration:
- pass
- else:
- raise ExtractorError(error_msg, expected=True)
+ error_element = find_xpath_attr(
+ meta, _x('.//smil:ref'), 'src',
+ 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4')
+ if error_element is not None:
+ raise ExtractorError(error_element.attrib['abstract'], expected=True)
formats = self._parse_smil_formats(
meta, smil_url, video_id, namespace=default_ns,
@@ -68,7 +66,7 @@ class ThePlatformBaseIE(InfoExtractor):
for caption in captions:
lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
subtitles[lang] = [{
- 'ext': 'srt' if mime == 'text/srt' else 'ttml',
+ 'ext': mimetype2ext(mime),
'url': src,
}]
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 49516abca..79f036fe4 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -71,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id
webpage = self._download_webpage(url, display_id)
@@ -117,7 +117,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
title = self._html_search_regex(
self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
- age_limit = self._rta_search(webpage)
+ age_limit = self._rta_search(webpage) or 18
duration = parse_duration(self._html_search_meta(
'duration', webpage, 'duration', default=None))
@@ -152,6 +152,36 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
}
+class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+
+ _TITLE_REGEX = r'<title>([^<]+)</title>'
+
+ _TESTS = [{
+ 'url': 'https://player.tnaflix.com/video/6538',
+ 'info_dict': {
+ 'id': '6538',
+ 'display_id': '6538',
+ 'ext': 'mp4',
+ 'title': 'Educational xxx video',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://player.empflix.com/video/33051',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
+ webpage)]
+
+
class TNAFlixIE(TNAFlixNetworkBaseIE):
_VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 69882da63..958bf8fff 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -17,6 +17,7 @@ from ..utils import (
encode_dict,
ExtractorError,
int_or_none,
+ orderedSet,
parse_duration,
parse_iso8601,
sanitized_Request,
@@ -251,6 +252,7 @@ class TwitchVodIE(TwitchItemBaseIE):
self._USHER_BASE, item_id,
compat_urllib_parse.urlencode({
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'allow_spectre': 'true',
'player': 'twitchweb',
'nauth': access_token['token'],
@@ -281,17 +283,36 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
entries = []
offset = 0
limit = self._PAGE_LIMIT
+ broken_paging_detected = False
+ counter_override = None
for counter in itertools.count(1):
response = self._download_json(
self._PLAYLIST_URL % (channel_id, offset, limit),
- channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+ channel_id,
+ 'Downloading %s videos JSON page %s'
+ % (self._PLAYLIST_TYPE, counter_override or counter))
page_entries = self._extract_playlist_page(response)
if not page_entries:
break
+ total = int_or_none(response.get('_total'))
+ # Since the beginning of March 2016 twitch's paging mechanism
+ # is completely broken on the twitch side. It simply ignores
+ # a limit and returns the whole offset number of videos.
+ # Working around by just requesting all videos at once.
+ if not broken_paging_detected and total and len(page_entries) > limit:
+ self.report_warning(
+ 'Twitch paging is broken on twitch side, requesting all videos at once',
+ channel_id)
+ broken_paging_detected = True
+ offset = total
+ counter_override = '(all at once)'
+ continue
entries.extend(page_entries)
+ if broken_paging_detected or total and len(page_entries) >= total:
+ break
offset += limit
return self.playlist_result(
- [self.url_result(entry) for entry in set(entries)],
+ [self.url_result(entry) for entry in orderedSet(entries)],
channel_id, channel_name)
def _extract_playlist_page(self, response):
@@ -411,6 +432,7 @@ class TwitchStreamIE(TwitchBaseIE):
query = {
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'p': random.randint(1000000, 10000000),
'player': 'twitchweb',
'segment_preference': '4',
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index a161f046b..e70b2ab3c 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -10,21 +10,26 @@ from ..utils import (
remove_end,
int_or_none,
ExtractorError,
- sanitized_Request,
)
-class TwitterCardIE(InfoExtractor):
+class TwitterBaseIE(InfoExtractor):
+ def _get_vmap_video_url(self, vmap_url, video_id):
+ vmap_data = self._download_xml(vmap_url, video_id)
+ return xpath_text(vmap_data, './/MediaFile').strip()
+
+
+class TwitterCardIE(TwitterBaseIE):
IE_NAME = 'twitter:card'
- _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
- 'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
+ # MD5 checksums are different in different places
'info_dict': {
'id': '560070183650213889',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 30.033,
}
@@ -35,14 +40,14 @@ class TwitterCardIE(InfoExtractor):
'info_dict': {
'id': '623160978427936768',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 80.155,
},
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
- 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+ 'md5': 'd4724ffe6d2437886d004fa5de1043b3',
'info_dict': {
'id': 'dq4Oj5quskI',
'ext': 'mp4',
@@ -62,69 +67,106 @@ class TwitterCardIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20151113',
'uploader_id': '1189339351084113920',
- 'uploader': '@ArsenalTerje',
- 'title': 'Vine by @ArsenalTerje',
+ 'uploader': 'ArsenalTerje',
+ 'title': 'Vine by ArsenalTerje',
},
'add_ie': ['Vine'],
- }
+ }, {
+ 'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
+ 'md5': '3846d0a07109b5ab622425449b59049d',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Twitter web player',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ },
]
def _real_extract(self, url):
video_id = self._match_id(url)
- # Different formats served for different User-Agents
- USER_AGENTS = [
- 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4
- 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm
- ]
-
config = None
formats = []
- for user_agent in USER_AGENTS:
- request = sanitized_Request(url)
- request.add_header('User-Agent', user_agent)
- webpage = self._download_webpage(request, video_id)
-
- iframe_url = self._html_search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
- webpage, 'video iframe', default=None)
- if iframe_url:
- return self.url_result(iframe_url)
-
- config = self._parse_json(self._html_search_regex(
- r'data-player-config="([^"]+)"', webpage, 'data player config'),
- video_id)
- if 'playlist' not in config:
- if 'vmapUrl' in config:
- vmap_data = self._download_xml(config['vmapUrl'], video_id)
- video_url = xpath_text(vmap_data, './/MediaFile').strip()
- formats.append({
- 'url': video_url,
- })
- break # same video regardless of UA
- continue
-
- video_url = config['playlist'][0]['source']
+ duration = None
- f = {
- 'url': video_url,
- }
+ webpage = self._download_webpage(url, video_id)
+
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
+ webpage, 'video iframe', default=None)
+ if iframe_url:
+ return self.url_result(iframe_url)
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),
+ video_id)
+ def _search_dimensions_in_video_url(a_format, video_url):
m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
if m:
- f.update({
+ a_format.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
+
+ playlist = config.get('playlist')
+ if playlist:
+ video_url = playlist[0]['source']
+
+ f = {
+ 'url': video_url,
+ }
+
+ _search_dimensions_in_video_url(f, video_url)
+
formats.append(f)
+
+ vmap_url = config.get('vmapUrl') or config.get('vmap_url')
+ if vmap_url:
+ formats.append({
+ 'url': self._get_vmap_video_url(vmap_url, video_id),
+ })
+
+ media_info = None
+
+ for entity in config.get('status', {}).get('entities', []):
+ if 'mediaInfo' in entity:
+ media_info = entity['mediaInfo']
+
+ if media_info:
+ for media_variant in media_info['variants']:
+ media_url = media_variant['url']
+ if media_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif media_url.endswith('.mpd'):
+ formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+ else:
+ vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
+ a_format = {
+ 'url': media_url,
+ 'format_id': 'http-%d' % vbr if vbr else 'http',
+ 'vbr': vbr,
+ }
+ # Reported bitRate may be zero
+ if not a_format['vbr']:
+ del a_format['vbr']
+
+ _search_dimensions_in_video_url(a_format, media_url)
+
+ formats.append(a_format)
+
+ duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
+
self._sort_formats(formats)
- thumbnail = config.get('posterImageUrl')
- duration = float_or_none(config.get('duration'))
+ title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ thumbnail = config.get('posterImageUrl') or config.get('image_src')
+ duration = float_or_none(config.get('duration')) or duration
return {
'id': video_id,
- 'title': 'TwitterCard',
+ 'title': title,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
@@ -138,7 +180,6 @@ class TwitterIE(InfoExtractor):
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
- 'md5': 'db6612ec5d03355953c3ca9250c97e5e',
'info_dict': {
'id': '643211948184596480',
'ext': 'mp4',
@@ -149,6 +190,9 @@ class TwitterIE(InfoExtractor):
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@@ -161,6 +205,7 @@ class TwitterIE(InfoExtractor):
'uploader': 'Gifs',
'uploader_id': 'giphz',
},
+ 'expected_warnings': ['height', 'width'],
}, {
'url': 'https://twitter.com/starwars/status/665052190608723968',
'md5': '39b7199856dee6cd4432e72c74bc69d4',
@@ -172,6 +217,36 @@ class TwitterIE(InfoExtractor):
'uploader_id': 'starwars',
'uploader': 'Star Wars',
},
+ }, {
+ 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
+ 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
+ 'uploader_id': 'BTNBrentYarina',
+ 'uploader': 'Brent Yarina',
+ },
+ 'params': {
+ # The same video as https://twitter.com/i/videos/tweet/705235433198714880
+ # Test case of TwitterCardIE
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '700207533655363584',
+ 'ext': 'mp4',
+ 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'jay',
+ 'uploader_id': 'jaydingeer',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}]
def _real_extract(self, url):
@@ -208,21 +283,91 @@ class TwitterIE(InfoExtractor):
return info
mobj = re.search(r'''(?x)
- <video[^>]+class="animated-gif"[^>]+
- (?:data-height="(?P<height>\d+)")?[^>]+
- (?:data-width="(?P<width>\d+)")?[^>]+
- (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
+ <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
<source[^>]+video-src="(?P<url>[^"]+)"
''', webpage)
if mobj:
+ more_info = mobj.group('more_info')
+ height = int_or_none(self._search_regex(
+ r'data-height="(\d+)"', more_info, 'height', fatal=False))
+ width = int_or_none(self._search_regex(
+ r'data-width="(\d+)"', more_info, 'width', fatal=False))
+ thumbnail = self._search_regex(
+ r'poster="([^"]+)"', more_info, 'poster', fatal=False)
info.update({
'id': twid,
'url': mobj.group('url'),
- 'height': int_or_none(mobj.group('height')),
- 'width': int_or_none(mobj.group('width')),
- 'thumbnail': mobj.group('poster'),
+ 'height': height,
+ 'width': width,
+ 'thumbnail': thumbnail,
})
return info
- raise ExtractorError('There\'s not video in this tweet.')
+ if 'class="PlayableMedia' in webpage:
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid),
+ })
+
+ return info
+
+ raise ExtractorError('There\'s no video in this tweet.')
+
+
+class TwitterAmplifyIE(TwitterBaseIE):
+ IE_NAME = 'twitter:amplify'
+ _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'
+
+ _TEST = {
+ 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+ 'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
+ 'info_dict': {
+ 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+ 'ext': 'mp4',
+ 'title': 'Twitter Video',
+ 'thumbnail': 're:^https?://.*',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ vmap_url = self._html_search_meta(
+ 'twitter:amplify:vmap', webpage, 'vmap url')
+ video_url = self._get_vmap_video_url(vmap_url, video_id)
+
+ thumbnails = []
+ thumbnail = self._html_search_meta(
+ 'twitter:image:src', webpage, 'thumbnail', fatal=False)
+
+ def _find_dimension(target):
+ w = int_or_none(self._html_search_meta(
+ 'twitter:%s:width' % target, webpage, fatal=False))
+ h = int_or_none(self._html_search_meta(
+ 'twitter:%s:height' % target, webpage, fatal=False))
+ return w, h
+
+ if thumbnail:
+ thumbnail_w, thumbnail_h = _find_dimension('image')
+ thumbnails.append({
+ 'url': thumbnail,
+ 'width': thumbnail_w,
+ 'height': thumbnail_h,
+ })
+
+ video_w, video_h = _find_dimension('player')
+ formats = [{
+ 'url': video_url,
+ 'width': video_w,
+ 'height': video_h,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': 'Twitter Video',
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py
new file mode 100644
index 000000000..cafc082b6
--- /dev/null
+++ b/youtube_dl/extractor/ustudio.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+
+
+class UstudioIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
+ 'md5': '58bbfca62125378742df01fc2abbdef6',
+ 'info_dict': {
+ 'id': 'Uxu2my9bgSph',
+ 'display_id': 'san_francisco_golden_gate_bridge',
+ 'ext': 'mp4',
+ 'title': 'San Francisco: Golden Gate Bridge',
+ 'description': 'md5:23925500697f2c6d4830e387ba51a9be',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20111107',
+ 'uploader': 'Tony Farley',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ config = self._download_xml(
+ 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
+ display_id)
+
+ def extract(kind):
+ return [{
+ 'url': item.attrib['url'],
+ 'width': int_or_none(item.get('width')),
+ 'height': int_or_none(item.get('height')),
+ } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
+
+ formats = extract('video')
+ self._sort_formats(formats)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
+ webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'Uploaded by\s*<a[^>]*>([^<]+)<',
+ webpage, 'uploader', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnails': extract('image'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 14e945d49..e148b1ef5 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):
'aftenbladet.no/tv': 'satv',
'fvn.no/fvntv': 'fvntv',
'aftenposten.no/webtv': 'aptv',
+ 'ap.vgtv.no/webtv': 'aptv',
}
_APP_NAME_TO_VENDOR = {
@@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):
(?P<host>
%s
)
- /
+ /?
(?:
\#!/(?:video|live)/|
embed?.*id=
@@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):
'md5': 'fd828cd29774a729bf4d4425fe192972',
'info_dict': {
'id': '21039',
- 'ext': 'mov',
+ 'ext': 'mp4',
'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',
'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
'duration': 66,
'timestamp': 1417002452,
'upload_date': '20141126',
'view_count': int,
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
'only_matching': True,
},
+ {
+ 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):
if len(video_id) == 5:
if appname == 'bttv':
info = self._extract_video_info('btno', video_id)
- elif appname == 'aptv':
- info = self._extract_video_info('ap', video_id)
streams = data['streamUrls']
stream_type = data.get('streamType')
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 3db6286e4..46c785ae1 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,31 +1,37 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
-
- _TESTS = [
- {
- 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- 'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
- 'ext': 'mp4',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
- },
- 'params': {
- # Requires ffmpeg (m3u8 manifest)
- 'skip_download': True,
- },
- }, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ 'duration': 725.983,
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):
except ExtractorError:
raise ExtractorError('The page doesn\'t contain a video', expected=True)
return self.url_result(ooyala_url, ie='Ooyala')
+
+
+class ViceShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
+
+ _TEST = {
+ 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
+ 'info_dict': {
+ 'id': 'fuck-thats-delicious-2',
+ 'title': "Fuck, That's Delicious",
+ 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
+ },
+ 'playlist_count': 17,
+ }
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+
+ entries = [
+ self.url_result(video_url, ViceIE.ie_key())
+ for video_url, _ in re.findall(
+ r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
+ % ViceIE._VALID_URL, webpage)]
+
+ title = self._search_regex(
+ r'<title>(.+?)</title>', webpage, 'title', default=None)
+ if title:
+ title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
+ description = self._html_search_meta('description', webpage, 'description')
+
+ return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 5e2e7cbac..4f0dcd18c 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import sanitized_Request
+from ..utils import (
+ decode_packed_codes,
+ sanitized_Request,
+)
class VideoMegaIE(InfoExtractor):
- _WORKING = False
_VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
@@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor):
r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+ real_codes = decode_packed_codes(webpage)
video_url = self._search_regex(
- r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
+ r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index 7c6e98026..3c78fb3d5 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -1,11 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import smuggle_url
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+ decode_packed_codes,
+ js_to_json,
+)
-class VidziIE(InfoExtractor):
+class VidziIE(JWPlatformBaseIE):
_VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
_TEST = {
'url': 'http://vidzi.tv/cghql9yq6emu.html',
@@ -14,7 +17,6 @@ class VidziIE(InfoExtractor):
'id': 'cghql9yq6emu',
'ext': 'mp4',
'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
- 'uploader': 'vidzi.tv',
},
'params': {
# m3u8 download
@@ -29,11 +31,12 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- # Vidzi now uses jwplayer, which can be handled by GenericIE
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': smuggle_url(url, {'to_generic': True}),
- 'ie_key': 'Generic',
- }
+ code = decode_packed_codes(webpage).replace('\\\'', '\'')
+ jwplayer_data = self._parse_json(
+ self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'),
+ video_id, transform_source=js_to_json)
+
+ info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+ info_dict['title'] = title
+
+ return info_dict
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 433fc9914..e04b814c8 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -176,13 +176,13 @@ class VikiIE(VikiBaseIE):
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
- 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'md5': '63f8600c1da6f01b7640eee7eca4f1da',
'info_dict': {
'id': '50562v',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Poor Nastya [COMPLETE] - Episode 1',
'description': '',
- 'duration': 607,
+ 'duration': 606,
'timestamp': 1274949505,
'upload_date': '20101213',
'uploader': 'ad14065n',
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 3049dffb6..71c30d2cd 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -73,15 +73,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
- https?://
- (?:(?:www|(?P<player>player))\.)?
- vimeo(?P<pro>pro)?\.com/
- (?!channels/[^/?#]+/?(?:$|[?#])|album/)
- (?:.*?/)?
- (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
- (?:videos?/)?
- (?P<id>[0-9]+)
- /?(?:[?&].*)?(?:[#].*)?$'''
+ https?://
+ (?:
+ (?:
+ www|
+ (?P<player>player)
+ )
+ \.
+ )?
+ vimeo(?P<pro>pro)?\.com/
+ (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/)
+ (?:.*?/)?
+ (?:
+ (?:
+ play_redirect_hls|
+ moogaloop\.swf)\?clip_id=
+ )?
+ (?:videos?/)?
+ (?P<id>[0-9]+)
+ /?(?:[?&].*)?(?:[#].*)?$
+ '''
IE_NAME = 'vimeo'
_TESTS = [
{
@@ -93,6 +104,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'upload_date': '20121220',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434',
'uploader_id': 'user7108434',
'uploader': 'Filippo Valsorda',
'duration': 10,
@@ -105,6 +117,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '68093876',
'ext': 'mp4',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
@@ -121,6 +134,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
'uploader': 'The BLN & Business of Software',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
'uploader_id': 'theblnbusinessofsoftware',
'duration': 3610,
'description': None,
@@ -135,6 +149,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
'upload_date': '20130614',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
@@ -154,6 +169,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Key & Peele: Terrorist Interrogation',
'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio',
'uploader_id': 'atencio',
'uploader': 'Peter Atencio',
'upload_date': '20130927',
@@ -169,6 +185,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': 'The New Vimeo Player (You Know, For Videos)',
'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
'upload_date': '20131015',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff',
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
@@ -183,6 +200,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Pier Solar OUYA Official Trailer',
'uploader': 'Tulio Gonçalves',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593',
'uploader_id': 'user28849593',
},
},
@@ -195,6 +213,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
'uploader': 'The DMCI',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci',
'uploader_id': 'dmci',
'upload_date': '20111220',
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
@@ -269,9 +288,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- headers = std_headers
+ headers = std_headers.copy()
if 'http_headers' in data:
- headers = headers.copy()
headers.update(data['http_headers'])
if 'Referer' not in headers:
headers['Referer'] = url
@@ -286,7 +304,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
- request = sanitized_Request(url, None, headers)
+ request = sanitized_Request(url, headers=headers)
try:
webpage = self._download_webpage(request, video_id)
except ExtractorError as ee:
@@ -370,9 +388,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
# Extract title
video_title = config['video']['title']
- # Extract uploader and uploader_id
- video_uploader = config['video']['owner']['name']
- video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None
+ # Extract uploader, uploader_url and uploader_id
+ video_uploader = config['video'].get('owner', {}).get('name')
+ video_uploader_url = config['video'].get('owner', {}).get('url')
+ video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
# Extract video thumbnail
video_thumbnail = config['video'].get('thumbnail')
@@ -473,6 +492,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
return {
'id': video_id,
'uploader': video_uploader,
+ 'uploader_url': video_uploader_url,
'uploader_id': video_uploader_id,
'upload_date': video_upload_date,
'title': video_title,
@@ -488,6 +508,38 @@ class VimeoIE(VimeoBaseInfoExtractor):
}
+class VimeoOndemandIE(VimeoBaseInfoExtractor):
+ IE_NAME = 'vimeo:ondemand'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ # ondemand video not available via https://vimeo.com/id
+ 'url': 'https://vimeo.com/ondemand/20704',
+ 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
+ 'info_dict': {
+ 'id': '105442900',
+ 'ext': 'mp4',
+ 'title': 'המעבדה - במאי יותם פלדמן',
+ 'uploader': 'גם סרטים',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms',
+ 'uploader_id': 'gumfilms',
+ },
+ }, {
+ 'url': 'https://vimeo.com/ondemand/nazmaalik',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/141692381',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key())
+
+
class VimeoChannelIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:channel'
_VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 0805e3c08..d560a4b5e 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -11,6 +11,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ int_or_none,
orderedSet,
sanitized_Request,
str_to_int,
@@ -141,10 +142,10 @@ class VKIE(InfoExtractor):
'url': 'https://vk.com/video276849682_170681728',
'info_dict': {
'id': 'V3K4mi0SYkc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
- 'duration': 179,
+ 'duration': 178,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
'uploader_id': 'thecjf',
@@ -152,6 +153,19 @@ class VKIE(InfoExtractor):
},
},
{
+ # video key is extra_data not url\d+
+ 'url': 'http://vk.com/video-110305615_171782105',
+ 'md5': 'e13fcda136f99764872e739d13fac1d1',
+ 'info_dict': {
+ 'id': '171782105',
+ 'ext': 'mp4',
+ 'title': 'S-Dance, репетиции к The way show',
+ 'uploader': 'THE WAY SHOW | 17 апреля',
+ 'upload_date': '20160207',
+ 'view_count': int,
+ },
+ },
+ {
# removed video, just testing that we match the pattern
'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
'only_matching': True,
@@ -298,12 +312,17 @@ class VKIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False))
- formats = [{
- 'format_id': k,
- 'url': v,
- 'width': int(k[len('url'):]),
- } for k, v in data.items()
- if k.startswith('url')]
+ formats = []
+ for k, v in data.items():
+ if not k.startswith('url') and k != 'extra_data' or not v:
+ continue
+ height = int_or_none(self._search_regex(
+ r'^url(\d+)', k, 'height', default=None))
+ formats.append({
+ 'format_id': k,
+ 'url': v,
+ 'height': height,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index affcc52f6..37cf3d309 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -12,7 +12,7 @@ from ..utils import (
class WatIE(InfoExtractor):
- _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'
+ _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
IE_NAME = 'wat.tv'
_TESTS = [
{
@@ -54,10 +54,12 @@ class WatIE(InfoExtractor):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
mobj = re.match(self._VALID_URL, url)
- short_id = mobj.group('short_id')
display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id or short_id)
- real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+ real_id = mobj.group('real_id')
+ if not real_id:
+ short_id = mobj.group('short_id')
+ webpage = self._download_webpage(url, display_id or short_id)
+ real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
video_info = self.download_video_info(real_id)
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 2037d9b3d..7aea47ed5 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor):
_VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
_GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
_USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
- _TESTS = [
- {
- 'url': 'http://www.webofstories.com/play/hans.bethe/71',
- 'md5': '373e4dd915f60cfe3116322642ddf364',
- 'info_dict': {
- 'id': '4536',
- 'ext': 'mp4',
- 'title': 'The temperature of the sun',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Hans Bethe talks about calculating the temperature of the sun',
- 'duration': 238,
- }
+ _TESTS = [{
+ 'url': 'http://www.webofstories.com/play/hans.bethe/71',
+ 'md5': '373e4dd915f60cfe3116322642ddf364',
+ 'info_dict': {
+ 'id': '4536',
+ 'ext': 'mp4',
+ 'title': 'The temperature of the sun',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Hans Bethe talks about calculating the temperature of the sun',
+ 'duration': 238,
+ }
+ }, {
+ 'url': 'http://www.webofstories.com/play/55908',
+ 'md5': '2985a698e1fe3211022422c4b5ed962c',
+ 'info_dict': {
+ 'id': '55908',
+ 'ext': 'mp4',
+ 'title': 'The story of Gemmata obscuriglobus',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+ 'duration': 169,
+ },
+ 'skip': 'notfound',
+ }, {
+ # malformed og:title meta
+ 'url': 'http://www.webofstories.com/play/54215?o=MS',
+ 'info_dict': {
+ 'id': '54215',
+ 'ext': 'mp4',
+ 'title': '"A Leg to Stand On"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+ 'duration': 97,
},
- {
- 'url': 'http://www.webofstories.com/play/55908',
- 'md5': '2985a698e1fe3211022422c4b5ed962c',
- 'info_dict': {
- 'id': '55908',
- 'ext': 'mp4',
- 'title': 'The story of Gemmata obscuriglobus',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
- 'duration': 169,
- }
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
+ # Sometimes og:title meta is malformed
+ title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+ r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 041ff6c55..fb0accac7 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -20,7 +20,7 @@ class WimpIE(InfoExtractor):
'md5': '4e2986c793694b55b37cf92521d12bb4',
'info_dict': {
'id': 'clowncar',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'It\'s like a clown car.',
'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
},
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index fdb16d91c..41061dd31 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -35,7 +35,8 @@ class WistiaIE(InfoExtractor):
formats = []
thumbnails = []
- for atype, a in data['assets'].items():
+ for a in data['assets']:
+ atype = a.get('type')
if atype == 'still':
thumbnails.append({
'url': a['url'],
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index a3236e66c..94abdb4f3 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -17,7 +17,7 @@ class XFileShareIE(InfoExtractor):
IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
@@ -81,6 +81,13 @@ class XFileShareIE(InfoExtractor):
'ext': 'mp4',
'title': 'test'
}
+ }, {
+ 'url': 'http://powerwatch.pw/duecjibvicbu',
+ 'info_dict': {
+ 'id': 'duecjibvicbu',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny trailer',
+ },
}]
def _real_extract(self, url):
@@ -112,6 +119,7 @@ class XFileShareIE(InfoExtractor):
title = (self._search_regex(
[r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
+ r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+) ',
r'<h2 class="video-page-head">([^<]+)</h2>'],
webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index d3cc1a29f..e699e663f 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -10,13 +10,27 @@ from ..compat import (
compat_urllib_parse,
)
from ..utils import (
+ ExtractorError,
int_or_none,
float_or_none,
sanitized_Request,
)
-class YandexMusicTrackIE(InfoExtractor):
+class YandexMusicBaseIE(InfoExtractor):
+ @staticmethod
+ def _handle_error(response):
+ error = response.get('error')
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ def _download_json(self, *args, **kwargs):
+ response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
+ self._handle_error(response)
+ return response
+
+
+class YandexMusicTrackIE(YandexMusicBaseIE):
IE_NAME = 'yandexmusic:track'
IE_DESC = 'Яндекс.Музыка - Трек'
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
@@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor):
return self._get_track_info(track)
-class YandexMusicPlaylistBaseIE(InfoExtractor):
+class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
def _build_playlist(self, tracks):
return [
self.url_result(
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index b29baafc4..1124fe6c2 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):
links = []
sources = self._search_regex(
- r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+ r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
if sources:
for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
links.append(link)
@@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):
}
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# We will benefit from it by extracting some metadata
- mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
if mobj:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e24dd3e5b..27e67feb4 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
import itertools
import json
import os.path
+import random
import re
import time
import traceback
@@ -382,7 +383,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -401,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
'alt_title': 'I Love It (feat. Charli XCX)',
- 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+ 'license': 'Standard YouTube License',
'creator': 'Icona Pop',
}
},
@@ -422,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+ 'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
'age_limit': 18,
}
@@ -437,6 +444,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
'uploader': 'SET India',
'uploader_id': 'setindia',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
}
},
@@ -449,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -468,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'm4a',
'upload_date': '20121002',
'uploader_id': '8KVIDEO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
+ 'license': 'Standard YouTube License',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
@@ -488,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
+ 'license': 'Standard YouTube License',
},
'params': {
'youtube_include_dash_manifest': True,
@@ -506,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
+ 'license': 'Standard YouTube License',
'creator': 'Taylor Swift',
},
'params': {
@@ -522,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20100909',
'uploader': 'The Amazing Atheist',
'uploader_id': 'TheAmazingAtheist',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+ 'license': 'Standard YouTube License',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
@@ -536,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -550,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
'upload_date': '20110629',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -562,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20100430',
'uploader_id': 'deadmau5',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',
'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
+ 'license': 'Standard YouTube License',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
},
@@ -580,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20150827',
'uploader_id': 'olympic',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
+ 'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
@@ -597,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'stretched_ratio': 16 / 9.,
'upload_date': '20110310',
'uploader_id': 'AllenMeow',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫艾倫',
+ 'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
},
@@ -629,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:116377fd2963b81ec4ce64b542173306',
'upload_date': '20150625',
'uploader_id': 'dorappi2000',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
+ 'license': 'Standard YouTube License',
'formats': 'mincount:33',
},
},
@@ -644,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'license': 'Standard YouTube License',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
@@ -668,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -678,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -688,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -698,6 +734,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}],
'params': {
@@ -731,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'upload_date': '20151119',
'uploader_id': 'IronSoulElf',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
+ 'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
},
'params': {
@@ -760,6 +800,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
{
+ # Video licensed under Creative Commons
+ 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+ 'info_dict': {
+ 'id': 'M4gD1WSo5mA',
+ 'ext': 'mp4',
+ 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+ 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+ 'upload_date': '20150127',
+ 'uploader_id': 'BerkmanCenter',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+ 'uploader': 'BerkmanCenter',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Channel-like uploader_url
+ 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+ 'info_dict': {
+ 'id': 'eQcmzGIKrzg',
+ 'ext': 'mp4',
+ 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+ 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+ 'upload_date': '20151119',
+ 'uploader': 'Bernie 2016',
+ 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
'only_matching': True,
}
@@ -975,40 +1051,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return {}
try:
args = player_config['args']
- caption_url = args['ttsurl']
- if not caption_url:
- self._downloader.report_warning(err_msg)
- return {}
- timestamp = args['timestamp']
- # We get the available subtitles
- list_params = compat_urllib_parse.urlencode({
- 'type': 'list',
- 'tlangs': 1,
- 'asrs': 1,
- })
- list_url = caption_url + '&' + list_params
- caption_list = self._download_xml(list_url, video_id)
- original_lang_node = caption_list.find('track')
- if original_lang_node is None:
- self._downloader.report_warning('Video doesn\'t have automatic captions')
- return {}
- original_lang = original_lang_node.attrib['lang_code']
- caption_kind = original_lang_node.attrib.get('kind', '')
+ caption_url = args.get('ttsurl')
+ if caption_url:
+ timestamp = args['timestamp']
+ # We get the available subtitles
+ list_params = compat_urllib_parse.urlencode({
+ 'type': 'list',
+ 'tlangs': 1,
+ 'asrs': 1,
+ })
+ list_url = caption_url + '&' + list_params
+ caption_list = self._download_xml(list_url, video_id)
+ original_lang_node = caption_list.find('track')
+ if original_lang_node is None:
+ self._downloader.report_warning('Video doesn\'t have automatic captions')
+ return {}
+ original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
+
+ sub_lang_list = {}
+ for lang_node in caption_list.findall('target'):
+ sub_lang = lang_node.attrib['lang_code']
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ params = compat_urllib_parse.urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
+ return sub_lang_list
+
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ parsed_caption_url = compat_urlparse.urlparse(caption_url)
+ caption_qs = compat_parse_qs(parsed_caption_url.query)
sub_lang_list = {}
- for lang_node in caption_list.findall('target'):
- sub_lang = lang_node.attrib['lang_code']
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if not sub_lang:
+ continue
sub_formats = []
for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': ext,
- 'ts': timestamp,
- 'kind': caption_kind,
+ caption_qs.update({
+ 'tlang': [sub_lang],
+ 'fmt': [ext],
})
+ sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
+ query=compat_urllib_parse.urlencode(caption_qs, True)))
sub_formats.append({
- 'url': caption_url + '&' + params,
+ 'url': sub_url,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
@@ -1019,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning(err_msg)
return {}
+ def _mark_watched(self, video_id, video_info):
+ playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+ if not playback_url:
+ return
+ parsed_playback_url = compat_urlparse.urlparse(playback_url)
+ qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+ # cpn generation algorithm is reverse engineered from base.js.
+ # In fact it works even with dummy cpn.
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+ cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+ qs.update({
+ 'ver': ['2'],
+ 'cpn': [cpn],
+ })
+ playback_url = compat_urlparse.urlunparse(
+ parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+ self._download_webpage(
+ playback_url, video_id, 'Marking watched',
+ 'Unable to mark watched', fatal=False)
+
@classmethod
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
@@ -1245,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+ video_uploader_url = None
+ mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+ video_webpage)
if mobj is not None:
- video_uploader_id = mobj.group(1)
+ video_uploader_id = mobj.group('uploader_id')
+ video_uploader_url = mobj.group('uploader_url')
else:
self._downloader.report_warning('unable to extract uploader nickname')
@@ -1275,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
+ video_license = self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+ video_webpage, 'license', default=None)
+
m_music = re.search(
r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
video_webpage)
@@ -1348,6 +1482,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
+ formats_spec = {}
+ fmt_list = video_info.get('fmt_list', [''])[0]
+ if fmt_list:
+ for fmt in fmt_list.split(','):
+ spec = fmt.split('/')
+ if len(spec) > 1:
+ width_height = spec[1].split('x')
+ if len(width_height) == 2:
+ formats_spec[spec[0]] = {
+ 'resolution': spec[1],
+ 'width': int_or_none(width_height[0]),
+ 'height': int_or_none(width_height[1]),
+ }
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
@@ -1416,6 +1563,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
if format_id in self._formats:
dct.update(self._formats[format_id])
+ if format_id in formats_spec:
+ dct.update(formats_spec[format_id])
# Some itags are not included in DASH manifest thus corresponding formats will
# lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
@@ -1528,11 +1677,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._sort_formats(formats)
+ self.mark_watched(video_id, video_info)
+
return {
'id': video_id,
'uploader': video_uploader,
'uploader_id': video_uploader_id,
+ 'uploader_url': video_uploader_url,
'upload_date': upload_date,
+ 'license': video_license,
'creator': video_creator,
'title': video_title,
'alt_title': video_alt_title,
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index c619a75e2..81c22a627 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -137,6 +137,10 @@ class ZDFIE(InfoExtractor):
formats.extend(self._extract_smil_formats(
video_url, video_id, fatal=False))
elif ext == 'm3u8':
+ # the certificates are misconfigured (see
+ # https://github.com/rg3/youtube-dl/issues/8665)
+ if video_url.startswith('https://'):
+ continue
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 3afa8bb6f..9dd7a8034 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -171,6 +171,14 @@ def parseOpts(overrideArguments=None):
default=False,
help='Do not extract the videos of a playlist, only list them.')
general.add_option(
+ '--mark-watched',
+ action='store_true', dest='mark_watched', default=False,
+ help='Mark videos watched (YouTube only)')
+ general.add_option(
+ '--no-mark-watched',
+ action='store_false', dest='mark_watched', default=False,
+ help='Do not mark videos watched (YouTube only)')
+ general.add_option(
'--no-color', '--no-colors',
action='store_true', dest='no_color',
default=False,
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
index 0d8ef6ca2..3ea518399 100644
--- a/youtube_dl/postprocessor/__init__.py
+++ b/youtube_dl/postprocessor/__init__.py
@@ -6,6 +6,7 @@ from .ffmpeg import (
FFmpegEmbedSubtitlePP,
FFmpegExtractAudioPP,
FFmpegFixupStretchedPP,
+ FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
FFmpegMergerPP,
FFmpegMetadataPP,
@@ -26,6 +27,7 @@ __all__ = [
'ExecAfterDownloadPP',
'FFmpegEmbedSubtitlePP',
'FFmpegExtractAudioPP',
+ 'FFmpegFixupM3u8PP',
'FFmpegFixupM4aPP',
'FFmpegFixupStretchedPP',
'FFmpegMergerPP',
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index e19dbf73d..3bad5a266 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -40,7 +40,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
'Skipping embedding the thumbnail because the file is missing.')
return [], info
- if info['ext'] == 'mp3':
+ if info['ext'] in ('mp3', 'mkv'):
options = [
'-c', 'copy', '-map', '0', '-map', '1',
'-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index cc7aaeda3..a8819f258 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -404,10 +404,6 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
- # https://github.com/rg3/youtube-dl/issues/8350
- if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False):
- options.extend(['-bsf:a', 'aac_adtstoasc'])
-
self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
@@ -480,6 +476,21 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):
return [], info
+class FFmpegFixupM3u8PP(FFmpegPostProcessor):
+ def run(self, info):
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+
+ options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+ self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename)
+ self.run_ffmpeg(filename, temp_filename, options)
+
+ os.remove(encodeFilename(filename))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return [], info
+
+
class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
def __init__(self, downloader=None, format=None):
super(FFmpegSubtitlesConvertorPP, self).__init__(downloader)
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
index 480d48d05..e39ca60aa 100644
--- a/youtube_dl/postprocessor/xattrpp.py
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -6,6 +6,7 @@ import sys
import errno
from .common import PostProcessor
+from ..compat import compat_os_name
from ..utils import (
check_executable,
hyphenate_date,
@@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor):
raise XAttrMetadataError(e.errno, e.strerror)
except ImportError:
- if os.name == 'nt':
+ if compat_os_name == 'nt':
# Write xattrs to NTFS Alternate Data Streams:
# http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
def write_xattr(path, key, value):
@@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor):
'Unable to write extended attributes due to too long values.')
else:
msg = 'This filesystem doesn\'t support extended attributes. '
- if os.name == 'nt':
+ if compat_os_name == 'nt':
msg += 'You need to use NTFS.'
else:
msg += '(You may have to enable them in your /etc/fstab)'
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 672ce05ea..9fd0ec8d5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -4,6 +4,7 @@
from __future__ import unicode_literals
import base64
+import binascii
import calendar
import codecs
import contextlib
@@ -159,8 +160,6 @@ if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z_-]+$', key)
- if val:
- assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
return node.find(expr)
else:
@@ -466,6 +465,10 @@ def encodeFilename(s, for_subprocess=False):
if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
return s
+ # Jython expects filenames to be Unicode strings even though it reports itself as Python 2.x compatible
+ if sys.platform.startswith('java'):
+ return s
+
return s.encode(get_subprocess_encoding(), 'ignore')
@@ -904,9 +907,9 @@ def unified_strdate(date_str, day_first=True):
'%d %b %Y',
'%B %d %Y',
'%b %d %Y',
- '%b %dst %Y %I:%M%p',
- '%b %dnd %Y %I:%M%p',
- '%b %dth %Y %I:%M%p',
+ '%b %dst %Y %I:%M',
+ '%b %dnd %Y %I:%M',
+ '%b %dth %Y %I:%M',
'%Y %m %d',
'%Y-%m-%d',
'%Y/%m/%d',
@@ -1216,13 +1219,23 @@ if sys.platform == 'win32':
raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
- import fcntl
+ # Some platforms, such as Jython, are missing fcntl
+ try:
+ import fcntl
- def _lock_file(f, exclusive):
- fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+ def _lock_file(f, exclusive):
+ fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
- def _unlock_file(f):
- fcntl.flock(f, fcntl.LOCK_UN)
+ def _unlock_file(f):
+ fcntl.flock(f, fcntl.LOCK_UN)
+ except ImportError:
+ UNSUPPORTED_MSG = 'file locking is not supported on this platform'
+
+ def _lock_file(f, exclusive):
+ raise IOError(UNSUPPORTED_MSG)
+
+ def _unlock_file(f):
+ raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
@@ -1303,6 +1316,17 @@ def format_bytes(bytes):
return '%.2f%s' % (converted, suffix)
+def lookup_unit_table(unit_table, s):
+ units_re = '|'.join(re.escape(u) for u in unit_table)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+ if not m:
+ return None
+ num_str = m.group('num').replace(',', '.')
+ mult = unit_table[m.group('unit')]
+ return int(float(num_str) * mult)
+
+
def parse_filesize(s):
if s is None:
return None
@@ -1346,15 +1370,28 @@ def parse_filesize(s):
'Yb': 1000 ** 8,
}
- units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
- m = re.match(
- r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
- if not m:
+ return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_count(s):
+ if s is None:
return None
- num_str = m.group('num').replace(',', '.')
- mult = _UNIT_TABLE[m.group('unit')]
- return int(float(num_str) * mult)
+ s = s.strip()
+
+ if re.match(r'^[\d,.]+$', s):
+ return str_to_int(s)
+
+ _UNIT_TABLE = {
+ 'k': 1000,
+ 'K': 1000,
+ 'm': 1000 ** 2,
+ 'M': 1000 ** 2,
+ 'kk': 1000 ** 2,
+ 'KK': 1000 ** 2,
+ }
+
+ return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
@@ -1386,6 +1423,12 @@ def fix_xml_ampersands(xml_str):
def setproctitle(title):
assert isinstance(title, compat_str)
+
+ # ctypes in Jython is not complete
+ # http://bugs.jython.org/issue2148
+ if sys.platform.startswith('java'):
+ return
+
try:
libc = ctypes.cdll.LoadLibrary('libc.so.6')
except OSError:
@@ -1569,9 +1612,12 @@ class PagedList(object):
class OnDemandPagedList(PagedList):
- def __init__(self, pagefunc, pagesize):
+ def __init__(self, pagefunc, pagesize, use_cache=False):
self._pagefunc = pagefunc
self._pagesize = pagesize
+ self._use_cache = use_cache
+ if use_cache:
+ self._cache = {}
def getslice(self, start=0, end=None):
res = []
@@ -1581,7 +1627,13 @@ class OnDemandPagedList(PagedList):
if start >= nextfirstid:
continue
- page_results = list(self._pagefunc(pagenum))
+ page_results = None
+ if self._use_cache:
+ page_results = self._cache.get(pagenum)
+ if page_results is None:
+ page_results = list(self._pagefunc(pagenum))
+ if self._use_cache:
+ self._cache[pagenum] = page_results
startv = (
start % self._pagesize
@@ -1711,6 +1763,15 @@ def urlencode_postdata(*args, **kargs):
return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+def update_url_query(url, query):
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_parse_qs(parsed_url.query)
+ qs.update(query)
+ qs = encode_dict(qs)
+ return compat_urlparse.urlunparse(parsed_url._replace(
+ query=compat_urllib_parse.urlencode(qs, True)))
+
+
def encode_dict(d, encoding='utf-8'):
def encode(v):
return v.encode(encoding) if isinstance(v, compat_basestring) else v
@@ -1835,11 +1896,21 @@ def error_to_compat_str(err):
def mimetype2ext(mt):
+ ext = {
+ 'audio/mp4': 'm4a',
+ }.get(mt)
+ if ext is not None:
+ return ext
+
_, _, res = mt.rpartition('/')
return {
'3gpp': '3gp',
+ 'smptett+xml': 'tt',
+ 'srt': 'srt',
+ 'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
+ 'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
@@ -2582,3 +2653,58 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
return None # No Proxy
return compat_urllib_request.ProxyHandler.proxy_open(
self, req, proxy, type)
+
+
+def ohdave_rsa_encrypt(data, exponent, modulus):
+ '''
+ Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
+
+ Input:
+ data: data to encrypt, bytes-like object
+ exponent, modulus: parameters e and N of the RSA algorithm, both integers
+ Output: hex string of encrypted data
+
+ Limitation: supports one block encryption only
+ '''
+
+ payload = int(binascii.hexlify(data[::-1]), 16)
+ encrypted = pow(payload, exponent, modulus)
+ return '%x' % encrypted
+
+
+def encode_base_n(num, n, table=None):
+ FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ if not table:
+ table = FULL_TABLE[:n]
+
+ if n > len(table):
+ raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+
+ if num == 0:
+ return table[0]
+
+ ret = ''
+ while num:
+ ret = table[num % n] + ret
+ num = num // n
+ return ret
+
+
+def decode_packed_codes(code):
+ mobj = re.search(
+ r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
+ code)
+ obfucasted_code, base, count, symbols = mobj.groups()
+ base = int(base)
+ count = int(count)
+ symbols = symbols.split('|')
+ symbol_table = {}
+
+ while count:
+ count -= 1
+ base_n_count = encode_base_n(count, base)
+ symbol_table[base_n_count] = symbols[count] or base_n_count
+
+ return re.sub(
+ r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+ obfucasted_code)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 9aca8001a..246f5740d 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2016.02.13'
+__version__ = '2016.03.06'