about summary refs log tree commit diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py53
-rw-r--r--youtube_dl/extractor/aenetworks.py2
-rw-r--r--youtube_dl/extractor/animeondemand.py160
-rw-r--r--youtube_dl/extractor/aol.py70
-rw-r--r--youtube_dl/extractor/appletrailers.py5
-rw-r--r--youtube_dl/extractor/arte.py117
-rw-r--r--youtube_dl/extractor/audimedia.py19
-rw-r--r--youtube_dl/extractor/audioboom.py66
-rw-r--r--youtube_dl/extractor/bbc.py32
-rw-r--r--youtube_dl/extractor/bleacherreport.py10
-rw-r--r--youtube_dl/extractor/bokecc.py60
-rw-r--r--youtube_dl/extractor/brightcove.py22
-rw-r--r--youtube_dl/extractor/c56.py22
-rw-r--r--youtube_dl/extractor/canvas.py33
-rw-r--r--youtube_dl/extractor/cbc.py113
-rw-r--r--youtube_dl/extractor/cbsnews.py20
-rw-r--r--youtube_dl/extractor/ccc.py2
-rw-r--r--youtube_dl/extractor/ceskatelevize.py8
-rw-r--r--youtube_dl/extractor/cinemassacre.py24
-rw-r--r--youtube_dl/extractor/cnet.py6
-rw-r--r--youtube_dl/extractor/cnn.py12
-rw-r--r--youtube_dl/extractor/collegerama.py6
-rw-r--r--youtube_dl/extractor/comedycentral.py11
-rw-r--r--youtube_dl/extractor/common.py168
-rw-r--r--youtube_dl/extractor/crunchyroll.py74
-rw-r--r--youtube_dl/extractor/dailymotion.py17
-rw-r--r--youtube_dl/extractor/douyutv.py23
-rw-r--r--youtube_dl/extractor/dplay.py123
-rw-r--r--youtube_dl/extractor/drbonanza.py10
-rw-r--r--youtube_dl/extractor/dw.py85
-rw-r--r--youtube_dl/extractor/eighttracks.py108
-rw-r--r--youtube_dl/extractor/ellentv.py2
-rw-r--r--youtube_dl/extractor/elpais.py31
-rw-r--r--youtube_dl/extractor/engadget.py25
-rw-r--r--youtube_dl/extractor/everyonesmixtape.py12
-rw-r--r--youtube_dl/extractor/exfm.py2
-rw-r--r--youtube_dl/extractor/facebook.py111
-rw-r--r--youtube_dl/extractor/faz.py2
-rw-r--r--youtube_dl/extractor/fc2.py2
-rw-r--r--youtube_dl/extractor/fivemin.py51
-rw-r--r--youtube_dl/extractor/foxnews.py4
-rw-r--r--youtube_dl/extractor/franceinter.py2
-rw-r--r--youtube_dl/extractor/francetv.py2
-rw-r--r--youtube_dl/extractor/freespeech.py2
-rw-r--r--youtube_dl/extractor/freevideo.py4
-rw-r--r--youtube_dl/extractor/gameinformer.py31
-rw-r--r--youtube_dl/extractor/generic.py65
-rw-r--r--youtube_dl/extractor/globo.py2
-rw-r--r--youtube_dl/extractor/googledrive.py14
-rw-r--r--youtube_dl/extractor/hentaistigma.py4
-rw-r--r--youtube_dl/extractor/imdb.py2
-rw-r--r--youtube_dl/extractor/indavideo.py2
-rw-r--r--youtube_dl/extractor/infoq.py31
-rw-r--r--youtube_dl/extractor/iqiyi.py367
-rw-r--r--youtube_dl/extractor/jeuxvideo.py2
-rw-r--r--youtube_dl/extractor/jwplatform.py63
-rw-r--r--youtube_dl/extractor/kaltura.py67
-rw-r--r--youtube_dl/extractor/kankan.py2
-rw-r--r--youtube_dl/extractor/khanacademy.py4
-rw-r--r--youtube_dl/extractor/kusi.py99
-rw-r--r--youtube_dl/extractor/kuwo.py1
-rw-r--r--youtube_dl/extractor/laola1tv.py131
-rw-r--r--youtube_dl/extractor/leeco.py (renamed from youtube_dl/extractor/letv.py)111
-rw-r--r--youtube_dl/extractor/lifenews.py101
-rw-r--r--youtube_dl/extractor/liveleak.py2
-rw-r--r--youtube_dl/extractor/livestream.py8
-rw-r--r--youtube_dl/extractor/makerschannel.py40
-rw-r--r--youtube_dl/extractor/mdr.py9
-rw-r--r--youtube_dl/extractor/minoto.py56
-rw-r--r--youtube_dl/extractor/mit.py2
-rw-r--r--youtube_dl/extractor/mixcloud.py5
-rw-r--r--youtube_dl/extractor/mofosex.py2
-rw-r--r--youtube_dl/extractor/motherless.py94
-rw-r--r--youtube_dl/extractor/mtv.py5
-rw-r--r--youtube_dl/extractor/myspass.py4
-rw-r--r--youtube_dl/extractor/myvideo.py1
-rw-r--r--youtube_dl/extractor/nba.py101
-rw-r--r--youtube_dl/extractor/nbc.py2
-rw-r--r--youtube_dl/extractor/nerdcubed.py4
-rw-r--r--youtube_dl/extractor/noz.py82
-rw-r--r--youtube_dl/extractor/nrk.py38
-rw-r--r--youtube_dl/extractor/orf.py1
-rw-r--r--youtube_dl/extractor/pbs.py123
-rw-r--r--youtube_dl/extractor/plays.py5
-rw-r--r--youtube_dl/extractor/pornhub.py88
-rw-r--r--youtube_dl/extractor/pornovoisines.py2
-rw-r--r--youtube_dl/extractor/pyvideo.py6
-rw-r--r--youtube_dl/extractor/radiobremen.py12
-rw-r--r--youtube_dl/extractor/radiofrance.py6
-rw-r--r--youtube_dl/extractor/rbmaradio.py10
-rw-r--r--youtube_dl/extractor/reverbnation.py12
-rw-r--r--youtube_dl/extractor/revision3.py97
-rw-r--r--youtube_dl/extractor/rice.py116
-rw-r--r--youtube_dl/extractor/ringtv.py14
-rw-r--r--youtube_dl/extractor/rte.py2
-rw-r--r--youtube_dl/extractor/rtl2.py2
-rw-r--r--youtube_dl/extractor/rtve.py30
-rw-r--r--youtube_dl/extractor/safari.py64
-rw-r--r--youtube_dl/extractor/screenjunkies.py138
-rw-r--r--youtube_dl/extractor/screenwavemedia.py20
-rw-r--r--youtube_dl/extractor/senateisvp.py62
-rw-r--r--youtube_dl/extractor/sexu.py25
-rw-r--r--youtube_dl/extractor/slutload.py4
-rw-r--r--youtube_dl/extractor/smotri.py4
-rw-r--r--youtube_dl/extractor/snotr.py2
-rw-r--r--youtube_dl/extractor/soundcloud.py2
-rw-r--r--youtube_dl/extractor/space.py38
-rw-r--r--youtube_dl/extractor/steam.py16
-rw-r--r--youtube_dl/extractor/ted.py2
-rw-r--r--youtube_dl/extractor/tenplay.py8
-rw-r--r--youtube_dl/extractor/tf1.py6
-rw-r--r--youtube_dl/extractor/theplatform.py23
-rw-r--r--youtube_dl/extractor/thesixtyone.py32
-rw-r--r--youtube_dl/extractor/tlc.py35
-rw-r--r--youtube_dl/extractor/tnaflix.py34
-rw-r--r--youtube_dl/extractor/traileraddict.py8
-rw-r--r--youtube_dl/extractor/tudou.py2
-rw-r--r--youtube_dl/extractor/twitch.py27
-rw-r--r--youtube_dl/extractor/twitter.py259
-rw-r--r--youtube_dl/extractor/usatoday.py48
-rw-r--r--youtube_dl/extractor/ustudio.py67
-rw-r--r--youtube_dl/extractor/vbox7.py2
-rw-r--r--youtube_dl/extractor/vgtv.py17
-rw-r--r--youtube_dl/extractor/vice.py78
-rw-r--r--youtube_dl/extractor/videomega.py10
-rw-r--r--youtube_dl/extractor/videopremium.py10
-rw-r--r--youtube_dl/extractor/vidzi.py27
-rw-r--r--youtube_dl/extractor/viki.py6
-rw-r--r--youtube_dl/extractor/vimeo.py104
-rw-r--r--youtube_dl/extractor/vine.py6
-rw-r--r--youtube_dl/extractor/vk.py35
-rw-r--r--youtube_dl/extractor/vrt.py9
-rw-r--r--youtube_dl/extractor/wat.py10
-rw-r--r--youtube_dl/extractor/webofstories.py64
-rw-r--r--youtube_dl/extractor/wimp.py2
-rw-r--r--youtube_dl/extractor/wistia.py3
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py14
-rw-r--r--youtube_dl/extractor/xfileshare.py10
-rw-r--r--youtube_dl/extractor/xtube.py98
-rw-r--r--youtube_dl/extractor/yandexmusic.py18
-rw-r--r--youtube_dl/extractor/youjizz.py4
-rw-r--r--youtube_dl/extractor/youku.py4
-rw-r--r--youtube_dl/extractor/youporn.py5
-rw-r--r--youtube_dl/extractor/youtube.py272
-rw-r--r--youtube_dl/extractor/zdf.py4
145 files changed, 4141 insertions, 1410 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index f1e5a5e86..179c11ffa 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -20,9 +20,13 @@ from .aftonbladet import AftonbladetIE
from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
+from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
from .anysex import AnySexIE
-from .aol import AolIE
+from .aol import (
+ AolIE,
+ AolFeaturesIE,
+)
from .allocine import AllocineIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
@@ -44,11 +48,13 @@ from .arte import (
ArteTVFutureIE,
ArteTVCinemaIE,
ArteTVDDCIE,
+ ArteTVMagazineIE,
ArteTVEmbedIE,
)
from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
from .azubu import AzubuIE, AzubuLiveIE
from .baidu import BaiduVideoIE
@@ -72,6 +78,7 @@ from .bleacherreport import (
)
from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
+from .bokecc import BokeCCIE
from .bpb import BpbIE
from .br import BRIE
from .breakcom import BreakIE
@@ -89,6 +96,10 @@ from .camdemy import (
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .canvas import CanvasIE
+from .cbc import (
+ CBCIE,
+ CBCPlayerIE,
+)
from .cbs import CBSIE
from .cbsnews import (
CBSNewsIE,
@@ -178,6 +189,10 @@ from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .dropbox import DropboxIE
+from .dw import (
+ DWIE,
+ DWArticleIE,
+)
from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
@@ -202,10 +217,7 @@ from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
from .extremetube import ExtremeTubeIE
-from .facebook import (
- FacebookIE,
- FacebookPostIE,
-)
+from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
from .fczenit import FczenitIE
@@ -333,6 +345,7 @@ from .konserthusetplay import KonserthusetPlayIE
from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
+from .kusi import KUSIIE
from .kuwo import (
KuwoIE,
KuwoAlbumIE,
@@ -345,10 +358,9 @@ from .la7 import LA7IE
from .laola1tv import Laola1TvIE
from .lecture2go import Lecture2GoIE
from .lemonde import LemondeIE
-from .letv import (
- LetvIE,
- LetvTvIE,
- LetvPlaylistIE,
+from .leeco import (
+ LeIE,
+ LePlaylistIE,
LetvCloudIE,
)
from .libsyn import LibsynIE
@@ -377,6 +389,7 @@ from .lynda import (
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
from .mailru import MailRuIE
+from .makerschannel import MakersChannelIE
from .makertv import MakerTVIE
from .malemotion import MalemotionIE
from .matchtv import MatchTVIE
@@ -386,6 +399,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
from .miomio import MioMioIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
@@ -485,6 +499,7 @@ from .nowtv import (
NowTVIE,
NowTVListIE,
)
+from .noz import NozIE
from .npo import (
NPOIE,
NPOLiveIE,
@@ -498,6 +513,7 @@ from .npr import NprIE
from .nrk import (
NRKIE,
NRKPlaylistIE,
+ NRKSkoleIE,
NRKTVIE,
)
from .ntvde import NTVDeIE
@@ -548,6 +564,7 @@ from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
PornHubPlaylistIE,
+ PornHubUserVideosIE,
)
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
@@ -581,6 +598,7 @@ from .regiotv import RegioTVIE
from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
from .revision3 import Revision3IE
+from .rice import RICEIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
@@ -615,6 +633,7 @@ from .sbs import SBSIE
from .scivee import SciVeeIE
from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE
+from .screenjunkies import ScreenJunkiesIE
from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
from .senateisvp import SenateISVPIE
from .servingsys import ServingSysIE
@@ -660,7 +679,6 @@ from .southpark import (
SouthParkEsIE,
SouthParkNlIE
)
-from .space import SpaceIE
from .spankbang import SpankBangIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
@@ -728,6 +746,7 @@ from .tmz import (
TMZArticleIE,
)
from .tnaflix import (
+ TNAFlixNetworkEmbedIE,
TNAFlixIE,
EMPFlixIE,
MovieFapIE,
@@ -789,7 +808,11 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
-from .twitter import TwitterCardIE, TwitterIE
+from .twitter import (
+ TwitterCardIE,
+ TwitterIE,
+ TwitterAmplifyIE,
+)
from .ubu import UbuIE
from .udemy import (
UdemyIE,
@@ -799,7 +822,9 @@ from .udn import UDNEmbedIE
from .digiteka import DigitekaIE
from .unistra import UnistraIE
from .urort import UrortIE
+from .usatoday import USATodayIE
from .ustream import UstreamIE, UstreamChannelIE
+from .ustudio import UstudioIE
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
@@ -813,7 +838,10 @@ from .vgtv import (
VGTVIE,
)
from .vh1 import VH1IE
-from .vice import ViceIE
+from .vice import (
+ ViceIE,
+ ViceShowIE,
+)
from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
@@ -840,6 +868,7 @@ from .vimeo import (
VimeoChannelIE,
VimeoGroupsIE,
VimeoLikesIE,
+ VimeoOndemandIE,
VimeoReviewIE,
VimeoUserIE,
VimeoWatchLaterIE,
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 43d7b0523..6018ae79a 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -28,7 +28,7 @@ class AENetworksIE(InfoExtractor):
'info_dict': {
'id': 'eg47EERs_JsZ',
'ext': 'mp4',
- 'title': "Winter Is Coming",
+ 'title': 'Winter Is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
},
'params': {
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
new file mode 100644
index 000000000..a7d8daf7b
--- /dev/null
+++ b/youtube_dl/extractor/animeondemand.py
@@ -0,0 +1,160 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ determine_ext,
+ encode_dict,
+ ExtractorError,
+ sanitized_Request,
+ urlencode_postdata,
+)
+
+
+class AnimeOnDemandIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)'
+ _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
+ _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
+ _NETRC_MACHINE = 'animeondemand'
+ _TEST = {
+ 'url': 'https://www.anime-on-demand.de/anime/161',
+ 'info_dict': {
+ 'id': '161',
+ 'title': 'Grimgar, Ashes and Illusions (OmU)',
+ 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
+ },
+ 'playlist_mincount': 4,
+ }
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_form = self._form_hidden_inputs('new_user', login_page)
+
+ login_form.update({
+ 'user[login]': username,
+ 'user[password]': password,
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=self._LOGIN_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+ request = sanitized_Request(
+ post_url, urlencode_postdata(encode_dict(login_form)))
+ request.add_header('Referer', self._LOGIN_URL)
+
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
+ error = self._search_regex(
+ r'<p class="alert alert-danger">(.+?)</p>',
+ response, 'error', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ anime_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, anime_id)
+
+ if 'data-playlist=' not in webpage:
+ self._download_webpage(
+ self._APPLY_HTML5_URL, anime_id,
+ 'Activating HTML5 beta', 'Unable to apply HTML5 beta')
+ webpage = self._download_webpage(url, anime_id)
+
+ csrf_token = self._html_search_meta(
+ 'csrf-token', webpage, 'csrf token', fatal=True)
+
+ anime_title = self._html_search_regex(
+ r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>',
+ webpage, 'anime name')
+ anime_description = self._html_search_regex(
+ r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
+ webpage, 'anime description', default=None)
+
+ entries = []
+
+ for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage):
+ m = re.search(
+ r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html)
+ if not m:
+ continue
+
+ episode_number = int(m.group('number'))
+ episode_title = m.group('title')
+ video_id = 'episode-%d' % episode_number
+
+ common_info = {
+ 'id': video_id,
+ 'series': anime_title,
+ 'episode': episode_title,
+ 'episode_number': episode_number,
+ }
+
+ formats = []
+
+ playlist_url = self._search_regex(
+ r'data-playlist=(["\'])(?P<url>.+?)\1',
+ episode_html, 'data playlist', default=None, group='url')
+ if playlist_url:
+ request = sanitized_Request(
+ compat_urlparse.urljoin(url, playlist_url),
+ headers={
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-CSRF-Token': csrf_token,
+ 'Referer': url,
+ 'Accept': 'application/json, text/javascript, */*; q=0.01',
+ })
+
+ playlist = self._download_json(
+ request, video_id, 'Downloading playlist JSON', fatal=False)
+ if playlist:
+ playlist = playlist['playlist'][0]
+ title = playlist['title']
+ description = playlist.get('description')
+ for source in playlist.get('sources', []):
+ file_ = source.get('file')
+ if file_ and determine_ext(file_) == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ file_, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+
+ if formats:
+ f = common_info.copy()
+ f.update({
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ })
+ entries.append(f)
+
+ m = re.search(
+ r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
+ episode_html)
+ if m:
+ f = common_info.copy()
+ f.update({
+ 'id': '%s-teaser' % f['id'],
+ 'title': m.group('title'),
+ 'url': compat_urlparse.urljoin(url, m.group('href')),
+ })
+ entries.append(f)
+
+ return self.playlist_result(entries, anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index b51eafc45..b761b2cc4 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -1,24 +1,11 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class AolIE(InfoExtractor):
IE_NAME = 'on.aol.com'
- _VALID_URL = r'''(?x)
- (?:
- aol-video:|
- http://on\.aol\.com/
- (?:
- video/.*-|
- playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
- )
- )
- (?P<id>[0-9]+)
- (?:$|\?)
- '''
+ _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)'
_TESTS = [{
'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
@@ -29,42 +16,31 @@ class AolIE(InfoExtractor):
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
},
'add_ie': ['FiveMin'],
- }, {
- 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316',
- 'info_dict': {
- 'id': '152147',
- 'title': 'Brace Yourself - Today\'s Weirdest News',
- },
- 'playlist_mincount': 10,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- playlist_id = mobj.group('playlist_id')
- if not playlist_id or self._downloader.params.get('noplaylist'):
- return self.url_result('5min:%s' % video_id)
+ video_id = self._match_id(url)
+ return self.url_result('5min:%s' % video_id)
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(
- r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
- playlist_html = self._search_regex(
- r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
- 'playlist HTML')
- entries = [{
- '_type': 'url',
- 'url': 'aol-video:%s' % m.group('id'),
- 'ie_key': 'Aol',
- } for m in re.finditer(
- r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
- playlist_html)]
+class AolFeaturesIE(InfoExtractor):
+ IE_NAME = 'features.aol.com'
+ _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)'
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'display_id': mobj.group('playlist_display_id'),
- 'title': title,
- 'entries': entries,
- }
+ _TESTS = [{
+ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts',
+ 'md5': '7db483bb0c09c85e241f84a34238cc75',
+ 'info_dict': {
+ 'id': '519507715',
+ 'ext': 'mp4',
+ 'title': 'What To Watch - February 17, 2016',
+ },
+ 'add_ie': ['FiveMin'],
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ return self.url_result(self._search_regex(
+ r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"',
+ webpage, '5min embed url'), 'FiveMin')
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 62ed0c918..be40f85b4 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -12,7 +12,7 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
IE_NAME = 'appletrailers'
- _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TESTS = [{
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
@@ -73,6 +73,9 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
'only_matching': True,
+ }, {
+ 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
+ 'only_matching': True,
}]
_JSON_RE = r'iTunes.playURL\((.*?)\);'
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 6ed855a57..3e119e21b 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -23,7 +23,7 @@ from ..utils import (
class ArteTvIE(InfoExtractor):
- _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
+ _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html'
IE_NAME = 'arte.tv'
def _real_extract(self, url):
@@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor):
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
@classmethod
def _extract_url_info(cls, url):
@@ -102,23 +102,45 @@ class ArteTVPlus7IE(InfoExtractor):
iframe_url = find_iframe_url(webpage, None)
if not iframe_url:
embed_url = self._html_search_regex(
- r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url')
- player = self._download_json(
- embed_url, video_id, 'Downloading player page')
- iframe_url = find_iframe_url(player['html'])
- json_url = compat_parse_qs(
- compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
- return self._extract_from_json_url(json_url, video_id, lang)
-
- def _extract_from_json_url(self, json_url, video_id, lang):
+ r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
+ if embed_url:
+ player = self._download_json(
+ embed_url, video_id, 'Downloading player page')
+ iframe_url = find_iframe_url(player['html'])
+ # en and es URLs produce react-based pages with different layout (e.g.
+ # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
+ if not iframe_url:
+ program = self._search_regex(
+ r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+ webpage, 'program', default=None)
+ if program:
+ embed_html = self._parse_json(program, video_id)
+ if embed_html:
+ iframe_url = find_iframe_url(embed_html['embed_html'])
+ if iframe_url:
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+ if json_url:
+ title = self._search_regex(
+ r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+ webpage, 'title', default=None, group='title')
+ return self._extract_from_json_url(json_url, video_id, lang, title=title)
+ # Different kind of embed URL (e.g.
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
+ webpage, 'embed url', group='url')
+ return self.url_result(embed_url)
+
+ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
upload_date_str = player_info.get('shootingDate')
if not upload_date_str:
- upload_date_str = player_info.get('VDA', '').split(' ')[0]
+ upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
- title = player_info['VTI'].strip()
+ title = (player_info.get('VTI') or title or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
@@ -132,27 +154,30 @@ class ArteTVPlus7IE(InfoExtractor):
}
qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ'])
+ LANGS = {
+ 'fr': 'F',
+ 'de': 'A',
+ 'en': 'E[ANG]',
+ 'es': 'E[ESP]',
+ }
+
formats = []
for format_id, format_dict in player_info['VSR'].items():
f = dict(format_dict)
versionCode = f.get('versionCode')
-
- langcode = {
- 'fr': 'F',
- 'de': 'A',
- }.get(lang, lang)
- lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode]
- lang_pref = (
- None if versionCode is None else (
- 10 if any(re.match(r, versionCode) for r in lang_rexs)
- else -10))
+ langcode = LANGS.get(lang, lang)
+ lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
+ lang_pref = None
+ if versionCode:
+ matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)]
+ lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs)
source_pref = 0
if versionCode is not None:
# The original version with subtitles has lower relevance
- if re.match(r'VO-ST(F|A)', versionCode):
+ if re.match(r'VO-ST(F|A|E)', versionCode):
source_pref -= 10
# The version with sourds/mal subtitles has also lower relevance
- elif re.match(r'VO?(F|A)-STM\1', versionCode):
+ elif re.match(r'VO?(F|A|E)-STM\1', versionCode):
source_pref -= 9
format = {
'format_id': format_id,
@@ -185,7 +210,7 @@ class ArteTVPlus7IE(InfoExtractor):
# It also uses the arte_vp_url url from the webpage to extract the information
class ArteTVCreativeIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:creative'
- _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/(?:magazine?/)?(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:magazine?/)?(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
@@ -209,7 +234,7 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
class ArteTVFutureIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:future'
- _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(?P<id>.+)'
+ _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses',
@@ -217,6 +242,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
'id': '050940-028-A',
'ext': 'mp4',
'title': 'Les écrevisses aussi peuvent être anxieuses',
+ 'upload_date': '20140902',
},
}, {
'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable',
@@ -226,7 +252,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
class ArteTVDDCIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
- _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+ _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
@@ -244,7 +270,7 @@ class ArteTVDDCIE(ArteTVPlus7IE):
class ArteTVConcertIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:concert'
- _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
+ _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
@@ -261,7 +287,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):
class ArteTVCinemaIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:cinema'
- _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
+ _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
_TEST = {
'url': 'http://cinema.arte.tv/de/node/38291',
@@ -276,6 +302,37 @@ class ArteTVCinemaIE(ArteTVPlus7IE):
}
+class ArteTVMagazineIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:magazine'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..."
+ 'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium',
+ 'md5': '2a9369bcccf847d1c741e51416299f25',
+ 'info_dict': {
+ 'id': '065965-000-A',
+ 'ext': 'mp4',
+ 'title': 'Trepalium - Extrait Ep.01',
+ 'upload_date': '20160121',
+ },
+ }, {
+ # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium"
+ 'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium',
+ 'md5': 'fedc64fc7a946110fe311634e79782ca',
+ 'info_dict': {
+ 'id': '054813-004_PLUS7-F',
+ 'ext': 'mp4',
+ 'title': 'Trepalium (4/6)',
+ 'description': 'md5:10057003c34d54e95350be4f9b05cb40',
+ 'upload_date': '20160218',
+ },
+ }, {
+ 'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis',
+ 'only_matching': True,
+ }]
+
+
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py
index 3b2effa15..aa6925623 100644
--- a/youtube_dl/extractor/audimedia.py
+++ b/youtube_dl/extractor/audimedia.py
@@ -10,9 +10,9 @@ from ..utils import (
class AudiMediaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'
_TEST = {
- 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',
+ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
'md5': '79a8b71c46d49042609795ab59779b66',
'info_dict': {
'id': '1565',
@@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload')
+ raw_payload = self._search_regex([
+ r'class="amtv-embed"[^>]+id="([^"]+)"',
+ r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"',
+ ], webpage, 'raw payload')
_, stage_mode, video_id, lang = raw_payload.split('-')
# TODO: handle s and e stage_mode (live streams and ended live streams)
@@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):
video_version_url = video_version.get('download_url') or video_version.get('stream_url')
if not video_version_url:
continue
- formats.append({
+ f = {
'url': video_version_url,
'width': int_or_none(video_version.get('width')),
'height': int_or_none(video_version.get('height')),
'abr': int_or_none(video_version.get('audio_bitrate')),
'vbr': int_or_none(video_version.get('video_bitrate')),
- })
+ }
+ bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
+ if bitrate:
+ f.update({
+ 'format_id': 'http-%s' % bitrate,
+ })
+ formats.append(f)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644
index 000000000..2ec2d7092
--- /dev/null
+++ b/youtube_dl/extractor/audioboom.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+ 'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+ 'info_dict': {
+ 'id': '4279833',
+ 'ext': 'mp3',
+ 'title': '3/09/2016 Czaban Hour 3',
+ 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
+ 'duration': 2245.72,
+ 'uploader': 'Steve Czaban',
+ 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ clip = None
+
+ clip_store = self._parse_json(
+ self._search_regex(
+ r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+ webpage, 'clip store', default='{}', group='json'),
+ video_id, fatal=False)
+ if clip_store:
+ clips = clip_store.get('clips')
+ if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+ clip = clips[0]
+
+    def from_clip(field):
+        if clip:
+            return clip.get(field)
+
+ audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+ 'audio', webpage, 'audio url')
+ title = from_clip('title') or self._og_search_title(webpage)
+ description = from_clip('description') or self._og_search_description(webpage)
+
+ duration = float_or_none(from_clip('duration') or self._html_search_meta(
+ 'weibo:audio:duration', webpage))
+
+ uploader = from_clip('author') or self._og_search_property(
+ 'audio:artist', webpage, 'uploader', fatal=False)
+ uploader_url = from_clip('author_url') or self._html_search_meta(
+ 'audioboo:channel', webpage, 'uploader url')
+
+ return {
+ 'id': video_id,
+ 'url': audio_url,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_url': uploader_url,
+ }
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 6ddee686c..e62b3860e 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
- remove_end,
unescapeHTML,
)
from ..compat import (
@@ -86,7 +85,7 @@ class BBCCoUkIE(InfoExtractor):
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Voice UK: Series 3: Blind Auditions 5',
- 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+ 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
'duration': 5100,
},
'params': {
@@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
- 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ 'title': 'BUGGER',
},
'playlist_count': 18,
}, {
@@ -670,10 +669,18 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
- 'title': 'What Liverpool can expect from Klopp',
+ 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
},
'playlist_count': 3,
}, {
+ # school report article with single video
+ 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+ 'info_dict': {
+ 'id': '35744779',
+ 'title': 'School which breaks down barriers in Jerusalem',
+ },
+ 'playlist_count': 1,
+ }, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'only_matching': True,
@@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp')
+
playlist_title = json_ld_info.get('title')
- playlist_description = json_ld_info.get('description')
+ if not playlist_title:
+ playlist_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ if playlist_title:
+ playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+ playlist_description = json_ld_info.get(
+ 'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
if entries:
- playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
- playlist_description = playlist_description or self._og_search_description(webpage, default=None)
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
- playlist_title = self._html_search_regex(
- r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage, default=None)
-
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 38bda3af5..7a8e1f60b 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):
'add_ie': ['Ooyala'],
}, {
'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
- 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50',
+ 'md5': '6a5cd403418c7b01719248ca97fb0692',
'info_dict': {
'id': '2586817',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
@@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):
'md5': '8c2c12e3af7805152675446c905d159b',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
new file mode 100644
index 000000000..122a1cbb6
--- /dev/null
+++ b/youtube_dl/extractor/bokecc.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import ExtractorError
+
+
+class BokeCCBaseIE(InfoExtractor):
+ def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
+ player_params_str = self._html_search_regex(
+ r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
+ webpage, 'player params')
+
+ player_params = compat_parse_qs(player_params_str)
+
+ info_xml = self._download_xml(
+ 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
+ player_params['siteid'][0], player_params['vid'][0]), video_id)
+
+ formats = [{
+ 'format_id': format_id,
+ 'url': quality.find('./copy').attrib['playurl'],
+ 'preference': int(quality.attrib['value']),
+ } for quality in info_xml.findall('./video/quality')]
+
+ self._sort_formats(formats)
+
+ return formats
+
+
+class BokeCCIE(BokeCCBaseIE):
+    IE_DESC = 'CC视频'
+ _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
+
+ _TESTS = [{
+ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B',
+ 'info_dict': {
+ 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30',
+ 'ext': 'flv',
+ 'title': 'BokeCC Video',
+ },
+ }]
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ if not qs.get('vid') or not qs.get('uid'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
+
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': 'BokeCC Video', # no title provided in the webpage
+ 'formats': self._extract_bokecc_formats(webpage, video_id),
+ }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index c947337f9..f8413d5f2 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -13,6 +13,7 @@ from ..compat import (
compat_urllib_parse_urlparse,
compat_urlparse,
compat_xml_parse_error,
+ compat_HTTPError,
)
from ..utils import (
determine_ext,
@@ -355,7 +356,7 @@ class BrightcoveLegacyIE(InfoExtractor):
class BrightcoveNewIE(InfoExtractor):
IE_NAME = 'brightcove:new'
- _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)'
+ _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
'md5': 'c8100925723840d4b0d243f7025703be',
@@ -391,6 +392,10 @@ class BrightcoveNewIE(InfoExtractor):
# ref: prefixed video id
'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
'only_matching': True,
+ }, {
+ # non numeric ref: prefixed video id
+ 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
+ 'only_matching': True,
}]
@staticmethod
@@ -424,7 +429,7 @@ class BrightcoveNewIE(InfoExtractor):
</video>.*?
<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js
+ (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js
''', webpage):
entries.append(
'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
@@ -458,15 +463,22 @@ class BrightcoveNewIE(InfoExtractor):
'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
% (account_id, video_id),
headers={'Accept': 'application/json;pk=%s' % policy_key})
- json_data = self._download_json(req, video_id)
+ try:
+ json_data = self._download_json(req, video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ json_data = self._parse_json(e.cause.read().decode(), video_id)
+ raise ExtractorError(json_data[0]['message'], expected=True)
+ raise
title = json_data['name']
formats = []
for source in json_data.get('sources', []):
+ container = source.get('container')
source_type = source.get('type')
src = source.get('src')
- if source_type == 'application/x-mpegURL':
+ if source_type == 'application/x-mpegURL' or container == 'M2TS':
if not src:
continue
formats.extend(self._extract_m3u8_formats(
@@ -484,7 +496,7 @@ class BrightcoveNewIE(InfoExtractor):
'width': int_or_none(source.get('width')),
'height': height,
'filesize': int_or_none(source.get('size')),
- 'container': source.get('container'),
+ 'container': container,
'vcodec': source.get('codec'),
'ext': source.get('container').lower(),
}
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index cb96c3876..cac8fdcba 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -4,12 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import js_to_json
class C56IE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
@@ -18,12 +19,29 @@ class C56IE(InfoExtractor):
'title': '网事知多少 第32期:车怒',
'duration': 283.813,
},
- }
+ }, {
+ 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '82247482',
+ 'title': '爱的诅咒之杜鹃花开',
+ },
+ 'playlist_count': 7,
+ 'add_ie': ['Sohu'],
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
+ webpage = self._download_webpage(url, text_id)
+ sohu_video_info_str = self._search_regex(
+ r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+ if sohu_video_info_str:
+ sohu_video_info = self._parse_json(
+ sohu_video_info_str, text_id, transform_source=js_to_json)
+ return self.url_result(sohu_video_info['url'], 'Sohu')
+
page = self._download_json(
'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
index ee19ff836..ec6d24d96 100644
--- a/youtube_dl/extractor/canvas.py
+++ b/youtube_dl/extractor/canvas.py
@@ -6,7 +6,7 @@ from ..utils import float_or_none
class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
'md5': 'ea838375a547ac787d4064d8c7860a6c',
'info_dict': {
@@ -18,7 +18,27 @@ class CanvasIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 49.02,
}
- }
+ }, {
+ # with subtitles
+ 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
+ 'info_dict': {
+ 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
+ 'display_id': 'pieter-0167',
+ 'ext': 'mp4',
+ 'title': 'Pieter 0167',
+ 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 2553.08,
+ 'subtitles': {
+ 'nl': [{
+ 'ext': 'vtt',
+ }],
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -54,6 +74,14 @@ class CanvasIE(InfoExtractor):
})
self._sort_formats(formats)
+ subtitles = {}
+ subtitle_urls = data.get('subtitleUrls')
+ if isinstance(subtitle_urls, list):
+ for subtitle in subtitle_urls:
+ subtitle_url = subtitle.get('url')
+ if subtitle_url and subtitle.get('type') == 'CLOSED':
+ subtitles.setdefault('nl', []).append({'url': subtitle_url})
+
return {
'id': video_id,
'display_id': display_id,
@@ -62,4 +90,5 @@ class CanvasIE(InfoExtractor):
'formats': formats,
'duration': float_or_none(data.get('duration'), 1000),
'thumbnail': data.get('posterImageUrl'),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
new file mode 100644
index 000000000..d8aa31038
--- /dev/null
+++ b/youtube_dl/extractor/cbc.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class CBCIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)'
+ _TESTS = [{
+ # with mediaId
+ 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
+ 'info_dict': {
+ 'id': '2682904050',
+ 'ext': 'flv',
+ 'title': 'Don Cherry – All-Stars',
+ 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
+ 'timestamp': 1454475540,
+ 'upload_date': '20160203',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # with clipId
+ 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
+ 'info_dict': {
+ 'id': '2487345465',
+ 'ext': 'flv',
+ 'title': 'Robin Williams freestyles on 90 Minutes Live',
+ 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
+ 'upload_date': '19700101',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # multiple iframes
+ 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '2680832926',
+ 'ext': 'flv',
+ 'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
+ 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
+ 'upload_date': '19700101',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '2658915080',
+ 'ext': 'flv',
+ 'title': 'Fly like an eagle!',
+ 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
+ 'upload_date': '19700101',
+ },
+ }],
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ player_init = self._search_regex(
+ r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init',
+ default=None)
+ if player_init:
+ player_info = self._parse_json(player_init, display_id, js_to_json)
+ media_id = player_info.get('mediaId')
+ if not media_id:
+ clip_id = player_info['clipId']
+ media_id = self._download_json(
+ 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
+ clip_id)['entries'][0]['id'].split('/')[-1]
+ return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
+ else:
+ entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]
+ return self.playlist_result(entries)
+
+
+class CBCPlayerIE(InfoExtractor):
+ _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.cbc.ca/player/play/2683190193',
+ 'info_dict': {
+ 'id': '2683190193',
+ 'ext': 'flv',
+ 'title': 'Gerry Runs a Sweat Shop',
+ 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
+ 'timestamp': 1455067800,
+ 'upload_date': '20160210',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(
+ 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id,
+ 'ThePlatformFeed', video_id)
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 8f864699f..7319ee1b7 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from .theplatform import ThePlatformIE
-from ..utils import parse_duration
+from ..utils import (
+ parse_duration,
+ find_xpath_attr,
+)
class CBSNewsIE(ThePlatformIE):
@@ -46,6 +49,15 @@ class CBSNewsIE(ThePlatformIE):
},
]
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
+        return {
+            'en': [{
+                'ext': 'ttml',
+                'url': closed_caption_e.attrib['value'],
+            }]
+        } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else {}
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -61,12 +73,6 @@ class CBSNewsIE(ThePlatformIE):
thumbnail = item.get('mediaImage') or item.get('thumbnail')
subtitles = {}
- if 'mpxRefId' in video_info:
- subtitles['en'] = [{
- 'ext': 'ttml',
- 'url': 'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId'],
- }]
-
formats = []
for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
pid = item.get('media' + format_id)
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index e94b1e35b..dda2c0959 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -45,7 +45,7 @@ class CCCIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
- r"(?s)<h3>About</h3>(.+?)<h3>",
+ r'(?s)<h3>About</h3>(.+?)<h3>',
webpage, 'description', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>",
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 6f7b2a70d..b27b4e670 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -177,16 +177,16 @@ class CeskaTelevizeIE(InfoExtractor):
for divider in [1000, 60, 60, 100]:
components.append(msec % divider)
msec //= divider
- return "{3:02}:{2:02}:{1:02},{0:03}".format(*components)
+ return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)
def _fix_subtitle(subtitle):
for line in subtitle.splitlines():
- m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line)
+ m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
if m:
yield m.group(1)
start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
- yield "{0} --> {1}".format(start, stop)
+ yield '{0} --> {1}'.format(start, stop)
else:
yield line
- return "\r\n".join(_fix_subtitle(subtitles))
+ return '\r\n'.join(_fix_subtitle(subtitles))
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index 6d9cd8abd..042c4f2f1 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor):
'title': '“Angry Video Game Nerd: The Movie” – Trailer',
'description': 'md5:fb87405fcb42a331742a0dce2708560b',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
@@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor):
'upload_date': '20131002',
'title': 'The Mummy’s Hand (1940)',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
# Youtube embedded video
'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
- 'md5': 'df4cf8a1dcedaec79a73d96d83b99023',
+ 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',
'info_dict': {
'id': 'OEVzPCY2T-g',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
'upload_date': '20061207',
'uploader': 'Cinemassacre',
@@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor):
{
# Youtube embedded video
'url': 'http://cinemassacre.com/2006/09/01/mckids/',
- 'md5': '6eb30961fa795fedc750eac4881ad2e1',
+ 'md5': '7393c4e0f54602ad110c793eb7a6513a',
'info_dict': {
'id': 'FnxsNhuikpo',
- 'ext': 'mp4',
+ 'ext': 'webm',
'upload_date': '20060901',
- 'uploader': 'Cinemassacre Extras',
+ 'uploader': 'Cinemassacre Extra',
'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
'uploader_id': 'Cinemassacre',
'title': 'AVGN: McKids',
@@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor):
'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
'upload_date': '20150525',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
]
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 5c3908f72..3cf0bf95b 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -51,9 +51,7 @@ class CNETIE(ThePlatformIE):
uploader = None
uploader_id = None
- mpx_account = data['config']['uvpConfig']['default']['mpx_account']
-
- metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id)
+ metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id)
description = vdata.get('description') or metadata.get('description')
duration = int_or_none(vdata.get('duration')) or metadata.get('duration')
@@ -62,7 +60,7 @@ class CNETIE(ThePlatformIE):
for (fkey, vid) in vdata['files'].items():
if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
continue
- release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid)
+ release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid
if fkey == 'hds':
release_url += '&manifest=f4m'
tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 3b1bd4033..53489a14e 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -26,14 +26,14 @@ class CNNIE(InfoExtractor):
'upload_date': '20130609',
},
}, {
- "url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
- "md5": "b5cc60c60a3477d185af8f19a2a26f4e",
- "info_dict": {
+ 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
+ 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
+ 'info_dict': {
'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
'ext': 'mp4',
- "title": "Student's epic speech stuns new freshmen",
- "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
- "upload_date": "20130821",
+ 'title': "Student's epic speech stuns new freshmen",
+ 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
+ 'upload_date': '20130821',
}
}, {
'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 40667a0f1..f9e84193d 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -46,9 +46,9 @@ class CollegeRamaIE(InfoExtractor):
video_id = self._match_id(url)
player_options_request = {
- "getPlayerOptionsRequest": {
- "ResourceId": video_id,
- "QueryString": "",
+ 'getPlayerOptionsRequest': {
+ 'ResourceId': video_id,
+ 'QueryString': '',
}
}
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 3e4bd10b6..5b1b99675 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -16,11 +16,11 @@ from ..utils import (
class ComedyCentralIE(MTVServicesInfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
- (video-clips|episodes|cc-studios|video-collections|full-episodes)
+ (video-clips|episodes|cc-studios|video-collections|full-episodes|shows)
/(?P<title>.*)'''
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
'info_dict': {
@@ -29,7 +29,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother',
'description': 'After a certain point, breastfeeding becomes c**kblocking.',
},
- }
+ }, {
+ 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+ 'only_matching': True,
+ }]
class ComedyCentralShowsIE(MTVServicesInfoExtractor):
@@ -192,7 +195,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
if len(altMovieParams) == 0:
raise ExtractorError('unable to find Flash URL in webpage ' + url)
else:
- mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
+ mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])]
uri = mMovieParams[0][1]
# Correct cc.com in uri
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cd7087bec..770105a5b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -15,13 +15,14 @@ import math
from ..compat import (
compat_cookiejar,
compat_cookies,
+ compat_etree_fromstring,
compat_getpass,
compat_http_client,
+ compat_os_name,
+ compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urlparse,
- compat_str,
- compat_etree_fromstring,
)
from ..utils import (
NO_DEFAULT,
@@ -46,6 +47,8 @@ from ..utils import (
xpath_with_ns,
determine_protocol,
parse_duration,
+ mimetype2ext,
+ update_url_query,
)
@@ -103,7 +106,7 @@ class InfoExtractor(object):
* protocol The protocol that will be used for the actual
download, lower-case.
"http", "https", "rtsp", "rtmp", "rtmpe",
- "m3u8", or "m3u8_native".
+ "m3u8", "m3u8_native" or "http_dash_segments".
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -156,12 +159,14 @@ class InfoExtractor(object):
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
+ license: License name the video is licensed under.
creator: The main artist who created the video.
release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
+ uploader_url: Full URL to a personal webpage of the video uploader.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{language: subformats}. "subformats" is a list sorted from
@@ -341,7 +346,7 @@ class InfoExtractor(object):
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
""" Returns the response handle """
if note is None:
self.report_download_webpage(video_id)
@@ -350,6 +355,12 @@ class InfoExtractor(object):
self.to_screen('%s' % (note,))
else:
self.to_screen('%s: %s' % (video_id, note))
+ # data, headers and query params will be ignored for `Request` objects
+ if isinstance(url_or_request, compat_str):
+ if query:
+ url_or_request = update_url_query(url_or_request, query)
+ if data or headers:
+ url_or_request = sanitized_Request(url_or_request, data, headers or {})
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -365,13 +376,13 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
if urlh is False:
assert not fatal
return False
@@ -424,7 +435,7 @@ class InfoExtractor(object):
self.to_screen('Saving request to ' + filename)
# Working around MAX_PATH limitation on Windows (see
# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
- if os.name == 'nt':
+ if compat_os_name == 'nt':
absfilepath = os.path.abspath(filename)
if len(absfilepath) > 259:
filename = '\\\\?\\' + absfilepath
@@ -458,13 +469,13 @@ class InfoExtractor(object):
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -479,10 +490,10 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None):
+ transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
if transform_source:
@@ -493,10 +504,10 @@ class InfoExtractor(object):
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True, encoding=None):
+ fatal=True, encoding=None, data=None, headers=None, query=None):
json_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding)
+ encoding=encoding, data=data, headers=headers, query=query)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -593,7 +604,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
+ if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -636,7 +647,7 @@ class InfoExtractor(object):
downloader_params = self._downloader.params
# Attempt to use provided username and password or .netrc data
- if downloader_params.get('username', None) is not None:
+ if downloader_params.get('username') is not None:
username = downloader_params['username']
password = downloader_params['password']
elif downloader_params.get('usenetrc', False):
@@ -663,7 +674,7 @@ class InfoExtractor(object):
return None
downloader_params = self._downloader.params
- if downloader_params.get('twofactor', None) is not None:
+ if downloader_params.get('twofactor') is not None:
return downloader_params['twofactor']
return compat_getpass('Type %s and press [Return]: ' % note)
@@ -744,7 +755,7 @@ class InfoExtractor(object):
'mature': 17,
'restricted': 19,
}
- return RATING_TABLE.get(rating.lower(), None)
+ return RATING_TABLE.get(rating.lower())
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
@@ -759,7 +770,7 @@ class InfoExtractor(object):
'0': 18,
'false': 18,
}
- return RATING_TABLE.get(family_friendly.lower(), None)
+ return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
return self._html_search_meta('twitter:player', html,
@@ -902,6 +913,16 @@ class InfoExtractor(object):
item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
formats)
+ @staticmethod
+ def _remove_duplicate_formats(formats):
+ format_urls = set()
+ unique_formats = []
+ for f in formats:
+ if f['url'] not in format_urls:
+ format_urls.add(f['url'])
+ unique_formats.append(f)
+ formats[:] = unique_formats
+
def _is_valid_url(self, url, video_id, item='video'):
url = self._proto_relative_url(url, scheme='http:')
# For now assume non HTTP(S) URLs always valid
@@ -955,6 +976,13 @@ class InfoExtractor(object):
if manifest is False:
return []
+ return self._parse_f4m_formats(
+ manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal)
+
+ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
formats = []
manifest_version = '1.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -980,7 +1008,8 @@ class InfoExtractor(object):
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
formats.extend(self._extract_f4m_formats(
- manifest_url, video_id, preference, f4m_id, fatal=fatal))
+ manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal))
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -1025,11 +1054,21 @@ class InfoExtractor(object):
return []
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
- # A Media Playlist Tag MUST NOT appear in a Master Playlist
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
- if '#EXT-X-TARGETDURATION' in m3u8_doc:
+
+ # We should try extracting formats only from master playlists [1], i.e.
+ # playlists that describe available qualities. On the other hand media
+ # playlists [2] should be returned as is since they contain just the media
+ # without qualities renditions.
+ # Fortunately, master playlist can be easily distinguished from media
+ # playlist based on particular tags availability. As of [1, 2] master
+ # playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
+ # and MUST NOT appear in master playlist thus we can clearly detect media
+ # playlist with this criterion.
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
+ # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
+ # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
'format_id': m3u8_id,
@@ -1076,19 +1115,29 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}
- codecs = last_info.get('CODECS')
- if codecs:
- # TODO: looks like video codec is not always necessarily goes first
- va_codecs = codecs.split(',')
- if va_codecs[0]:
- f['vcodec'] = va_codecs[0]
- if len(va_codecs) > 1 and va_codecs[1]:
- f['acodec'] = va_codecs[1]
resolution = last_info.get('RESOLUTION')
if resolution:
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ codecs = last_info.get('CODECS')
+ if codecs:
+ vcodec, acodec = [None] * 2
+ va_codecs = codecs.split(',')
+ if len(va_codecs) == 1:
+ # Audio only entries usually come with single codec and
+ # no resolution. For more robustness we also check it to
+ # be mp4 audio.
+ if not resolution and va_codecs[0].startswith('mp4a'):
+ vcodec, acodec = 'none', va_codecs[0]
+ else:
+ vcodec = va_codecs[0]
+ else:
+ vcodec, acodec = va_codecs[:2]
+ f.update({
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
if last_media is not None:
f['m3u8_media'] = last_media
last_media = None
@@ -1109,8 +1158,8 @@ class InfoExtractor(object):
out.append('{%s}%s' % (namespace, c))
return '/'.join(out)
- def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
- smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if smil is False:
assert not fatal
@@ -1127,10 +1176,10 @@ class InfoExtractor(object):
return {}
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
- def _download_smil(self, smil_url, video_id, fatal=True):
+ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
return self._download_xml(
smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
@@ -1189,12 +1238,13 @@ class InfoExtractor(object):
http_count = 0
m3u8_count = 0
- src_urls = []
+ srcs = []
videos = smil.findall(self._xpath_ns('.//video', namespace))
for video in videos:
src = video.get('src')
- if not src:
+ if not src or src in srcs:
continue
+ srcs.append(src)
bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
filesize = int_or_none(video.get('size') or video.get('fileSize'))
@@ -1226,9 +1276,7 @@ class InfoExtractor(object):
continue
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
- if src_url in src_urls:
- continue
- src_urls.append(src_url)
+ src_url = src_url.strip()
if proto == 'm3u8' or src_ext == 'm3u8':
m3u8_formats = self._extract_m3u8_formats(
@@ -1281,16 +1329,7 @@ class InfoExtractor(object):
if not src or src in urls:
continue
urls.append(src)
- ext = textstream.get('ext') or determine_ext(src)
- if not ext:
- type_ = textstream.get('type')
- SUBTITLES_TYPES = {
- 'text/vtt': 'vtt',
- 'text/srt': 'srt',
- 'application/smptett+xml': 'tt',
- }
- if type_ in SUBTITLES_TYPES:
- ext = SUBTITLES_TYPES[type_]
+ ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
subtitles.setdefault(lang, []).append({
'url': src,
@@ -1426,8 +1465,9 @@ class InfoExtractor(object):
continue
representation_attrib = adaptation_set.attrib.copy()
representation_attrib.update(representation.attrib)
- mime_type = representation_attrib.get('mimeType')
- content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+ # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
+ mime_type = representation_attrib['mimeType']
+ content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
@@ -1439,7 +1479,9 @@ class InfoExtractor(object):
base_url = base_url_e.text + base_url
if re.match(r'^https?://', base_url):
break
- if not re.match(r'^https?://', base_url):
+ if mpd_base_url and not re.match(r'^https?://', base_url):
+ if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+ mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
@@ -1448,6 +1490,7 @@ class InfoExtractor(object):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'url': base_url,
+ 'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
@@ -1499,7 +1542,7 @@ class InfoExtractor(object):
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
- now_str = now.strftime("%Y-%m-%d %H:%M")
+ now_str = now.strftime('%Y-%m-%d %H:%M')
return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
@@ -1572,7 +1615,7 @@ class InfoExtractor(object):
return {}
def _get_subtitles(self, *args, **kwargs):
- raise NotImplementedError("This method must be implemented by subclasses")
+ raise NotImplementedError('This method must be implemented by subclasses')
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
@@ -1598,7 +1641,16 @@ class InfoExtractor(object):
return {}
def _get_automatic_captions(self, *args, **kwargs):
- raise NotImplementedError("This method must be implemented by subclasses")
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def mark_watched(self, *args, **kwargs):
+ if (self._downloader.params.get('mark_watched', False) and
+ (self._get_login_info()[0] is not None or
+ self._downloader.params.get('cookiefile') is not None)):
+ self._mark_watched(*args, **kwargs)
+
+ def _mark_watched(self, *args, **kwargs):
+ raise NotImplementedError('This method must be implemented by subclasses')
class SearchInfoExtractor(InfoExtractor):
@@ -1638,7 +1690,7 @@ class SearchInfoExtractor(InfoExtractor):
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
- raise NotImplementedError("This method must be implemented by subclasses")
+ raise NotImplementedError('This method must be implemented by subclasses')
@property
def SEARCH_KEY(self):
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 785594df8..c7032ffa2 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -180,40 +180,40 @@ class CrunchyrollIE(CrunchyrollBaseIE):
return assvalue
output = '[Script Info]\n'
- output += 'Title: %s\n' % sub_root.attrib["title"]
+ output += 'Title: %s\n' % sub_root.attrib['title']
output += 'ScriptType: v4.00+\n'
- output += 'WrapStyle: %s\n' % sub_root.attrib["wrap_style"]
- output += 'PlayResX: %s\n' % sub_root.attrib["play_res_x"]
- output += 'PlayResY: %s\n' % sub_root.attrib["play_res_y"]
+ output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style']
+ output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x']
+ output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y']
output += """ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
"""
for style in sub_root.findall('./styles/style'):
- output += 'Style: ' + style.attrib["name"]
- output += ',' + style.attrib["font_name"]
- output += ',' + style.attrib["font_size"]
- output += ',' + style.attrib["primary_colour"]
- output += ',' + style.attrib["secondary_colour"]
- output += ',' + style.attrib["outline_colour"]
- output += ',' + style.attrib["back_colour"]
- output += ',' + ass_bool(style.attrib["bold"])
- output += ',' + ass_bool(style.attrib["italic"])
- output += ',' + ass_bool(style.attrib["underline"])
- output += ',' + ass_bool(style.attrib["strikeout"])
- output += ',' + style.attrib["scale_x"]
- output += ',' + style.attrib["scale_y"]
- output += ',' + style.attrib["spacing"]
- output += ',' + style.attrib["angle"]
- output += ',' + style.attrib["border_style"]
- output += ',' + style.attrib["outline"]
- output += ',' + style.attrib["shadow"]
- output += ',' + style.attrib["alignment"]
- output += ',' + style.attrib["margin_l"]
- output += ',' + style.attrib["margin_r"]
- output += ',' + style.attrib["margin_v"]
- output += ',' + style.attrib["encoding"]
+ output += 'Style: ' + style.attrib['name']
+ output += ',' + style.attrib['font_name']
+ output += ',' + style.attrib['font_size']
+ output += ',' + style.attrib['primary_colour']
+ output += ',' + style.attrib['secondary_colour']
+ output += ',' + style.attrib['outline_colour']
+ output += ',' + style.attrib['back_colour']
+ output += ',' + ass_bool(style.attrib['bold'])
+ output += ',' + ass_bool(style.attrib['italic'])
+ output += ',' + ass_bool(style.attrib['underline'])
+ output += ',' + ass_bool(style.attrib['strikeout'])
+ output += ',' + style.attrib['scale_x']
+ output += ',' + style.attrib['scale_y']
+ output += ',' + style.attrib['spacing']
+ output += ',' + style.attrib['angle']
+ output += ',' + style.attrib['border_style']
+ output += ',' + style.attrib['outline']
+ output += ',' + style.attrib['shadow']
+ output += ',' + style.attrib['alignment']
+ output += ',' + style.attrib['margin_l']
+ output += ',' + style.attrib['margin_r']
+ output += ',' + style.attrib['margin_v']
+ output += ',' + style.attrib['encoding']
output += '\n'
output += """
@@ -222,15 +222,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
for event in sub_root.findall('./events/event'):
output += 'Dialogue: 0'
- output += ',' + event.attrib["start"]
- output += ',' + event.attrib["end"]
- output += ',' + event.attrib["style"]
- output += ',' + event.attrib["name"]
- output += ',' + event.attrib["margin_l"]
- output += ',' + event.attrib["margin_r"]
- output += ',' + event.attrib["margin_v"]
- output += ',' + event.attrib["effect"]
- output += ',' + event.attrib["text"]
+ output += ',' + event.attrib['start']
+ output += ',' + event.attrib['end']
+ output += ',' + event.attrib['style']
+ output += ',' + event.attrib['name']
+ output += ',' + event.attrib['margin_l']
+ output += ',' + event.attrib['margin_r']
+ output += ',' + event.attrib['margin_v']
+ output += ',' + event.attrib['effect']
+ output += ',' + event.attrib['text']
output += '\n'
return output
@@ -376,7 +376,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
- IE_NAME = "crunchyroll:playlist"
+ IE_NAME = 'crunchyroll:playlist'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 6e462af69..2e6226ea0 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -122,10 +122,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
description = self._og_search_description(webpage) or self._html_search_meta(
'description', webpage, 'description')
- view_count = str_to_int(self._search_regex(
- [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"',
- r'video_views_count[^>]+>\s+([\d\.,]+)'],
- webpage, 'view count', fatal=False))
+ view_count_str = self._search_regex(
+ (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
+ r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
+ webpage, 'view count', fatal=False)
+ if view_count_str:
+ view_count_str = re.sub(r'\s', '', view_count_str)
+ view_count = str_to_int(view_count_str)
comment_count = int_or_none(self._search_regex(
r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
webpage, 'comment count', fatal=False))
@@ -396,13 +399,13 @@ class DailymotionCloudIE(DailymotionBaseInfoExtractor):
}]
@classmethod
- def _extract_dmcloud_url(self, webpage):
- mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
+ def _extract_dmcloud_url(cls, webpage):
+ mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage)
if mobj:
return mobj.group(1)
mobj = re.search(
- r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+ r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL,
webpage)
if mobj:
return mobj.group(1)
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 373b3b4b4..bdc768c78 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):
'display_id': 'iseven',
'ext': 'flv',
'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+ 'description': 'md5:f34981259a03e980a3c6404190a3ed61',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': '7师傅',
'uploader_id': '431925',
@@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
}, {
'url': 'http://www.douyutv.com/85982',
'info_dict': {
@@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Room not found',
+ }, {
+ 'url': 'http://www.douyutv.com/17732',
+ 'info_dict': {
+ 'id': '17732',
+ 'display_id': '17732',
+ 'ext': 'flv',
+ 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:f34981259a03e980a3c6404190a3ed61',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': '7师傅',
+ 'uploader_id': '431925',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 6cda56a7f..a638c827c 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -1,6 +1,8 @@
-# encoding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
import time
from .common import InfoExtractor
@@ -8,44 +10,125 @@ from ..utils import int_or_none
class DPlayIE(InfoExtractor):
- _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'http://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
- _TEST = {
+ _TESTS = [{
+ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
+ 'info_dict': {
+ 'id': '1255600',
+ 'display_id': 'stagione-1-episodio-25',
+ 'ext': 'mp4',
+ 'title': 'Episodio 25',
+ 'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
+ 'duration': 2761,
+ 'timestamp': 1454701800,
+ 'upload_date': '20160205',
+ 'creator': 'RTIT',
+ 'series': 'Take me out',
+ 'season_number': 1,
+ 'episode_number': 25,
+ 'age_limit': 0,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
+ }, {
'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
'info_dict': {
'id': '3172',
- 'ext': 'mp4',
'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+ 'ext': 'flv',
'title': 'Svensken lär sig njuta av livet',
+ 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
'duration': 2650,
+ 'timestamp': 1365454320,
+ 'upload_date': '20130408',
+ 'creator': 'Kanal 5 (Home)',
+ 'series': 'Nugammalt - 77 händelser som format Sverige',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'age_limit': 0,
},
- }
+ }, {
+ 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/',
+ 'info_dict': {
+ 'id': '70816',
+ 'display_id': 'season-6-episode-12',
+ 'ext': 'flv',
+ 'title': 'Episode 12',
+ 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90',
+ 'duration': 2563,
+ 'timestamp': 1429696800,
+ 'upload_date': '20150422',
+ 'creator': 'Kanal 4',
+ 'series': 'Mig og min mor',
+ 'season_number': 6,
+ 'episode_number': 12,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ domain = mobj.group('domain')
+
webpage = self._download_webpage(url, display_id)
+
video_id = self._search_regex(
- r'data-video-id="(\d+)"', webpage, 'video id')
+ r'data-video-id=["\'](\d+)', webpage, 'video id')
info = self._download_json(
- 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+ 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),
video_id)['data'][0]
- self._set_cookie(
- 'secure.dplay.se', 'dsc-geo',
- '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
- # TODO: consider adding support for 'stream_type=hds', it seems to
- # require setting some cookies
- manifest_url = self._download_json(
- 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
- video_id, 'Getting manifest url for hls stream')['hls']
- formats = self._extract_m3u8_formats(
- manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+ title = info['title']
+
+ PROTOCOLS = ('hls', 'hds')
+ formats = []
+
+ def extract_formats(protocol, manifest_url):
+ if protocol == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False))
+ elif protocol == 'hds':
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0',
+ video_id, f4m_id=protocol, fatal=False))
+
+ domain_tld = domain.split('.')[-1]
+ if domain_tld in ('se', 'dk'):
+ for protocol in PROTOCOLS:
+ self._set_cookie(
+ 'secure.dplay.%s' % domain_tld, 'dsc-geo',
+ json.dumps({
+ 'countryCode': domain_tld.upper(),
+ 'expiry': (time.time() + 20 * 60) * 1000,
+ }))
+ stream = self._download_json(
+ 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s'
+ % (domain_tld, video_id, protocol), video_id,
+ 'Downloading %s stream JSON' % protocol, fatal=False)
+ if stream and stream.get(protocol):
+ extract_formats(protocol, stream[protocol])
+ else:
+ for protocol in PROTOCOLS:
+ if info.get(protocol):
+ extract_formats(protocol, info[protocol])
return {
'id': video_id,
'display_id': display_id,
- 'title': info['title'],
- 'formats': formats,
+ 'title': title,
+ 'description': info.get('video_metadata_longDescription'),
'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+ 'timestamp': int_or_none(info.get('video_publish_date')),
+ 'creator': info.get('video_metadata_homeChannel'),
+ 'series': info.get('video_metadata_show'),
+ 'season_number': int_or_none(info.get('season')),
+ 'episode_number': int_or_none(info.get('episode')),
+ 'age_limit': int_or_none(info.get('minimum_age')),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 8b98b013a..01271f8f0 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -87,7 +87,7 @@ class DRBonanzaIE(InfoExtractor):
formats = []
for file in info['Files']:
- if info['Type'] == "Video":
+ if info['Type'] == 'Video':
if file['Type'] in video_types:
format = parse_filename_info(file['Location'])
format.update({
@@ -101,10 +101,10 @@ class DRBonanzaIE(InfoExtractor):
if '/bonanza/' in rtmp_url:
format['play_path'] = rtmp_url.split('/bonanza/')[1]
formats.append(format)
- elif file['Type'] == "Thumb":
+ elif file['Type'] == 'Thumb':
thumbnail = file['Location']
- elif info['Type'] == "Audio":
- if file['Type'] == "Audio":
+ elif info['Type'] == 'Audio':
+ if file['Type'] == 'Audio':
format = parse_filename_info(file['Location'])
format.update({
'url': file['Location'],
@@ -112,7 +112,7 @@ class DRBonanzaIE(InfoExtractor):
'vcodec': 'none',
})
formats.append(format)
- elif file['Type'] == "Thumb":
+ elif file['Type'] == 'Thumb':
thumbnail = file['Location']
description = '%s\n%s\n%s\n' % (
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644
index 000000000..b6c985547
--- /dev/null
+++ b/youtube_dl/extractor/dw.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+ IE_NAME = 'dw'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+ _TESTS = [{
+ # video
+ 'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+ 'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+ 'info_dict': {
+ 'id': '19112290',
+ 'ext': 'mp4',
+ 'title': 'Intelligent light',
+ 'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+ 'upload_date': '20160311',
+ }
+ }, {
+ # audio
+ 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+ 'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+ 'info_dict': {
+ 'id': '19111941',
+ 'ext': 'mp3',
+ 'title': 'WorldLink: My business',
+ 'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+ 'upload_date': '20160311',
+ }
+ }]
+
+ def _real_extract(self, url):
+ media_id = self._match_id(url)
+ webpage = self._download_webpage(url, media_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ title = hidden_inputs['media_title']
+
+ formats = []
+ if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+ formats = self._extract_smil_formats(
+ 'http://www.dw.com/smil/v-%s' % media_id, media_id,
+ transform_source=lambda s: s.replace(
+ 'rtmp://tv-od.dw.de/flash/',
+ 'http://tv-download.dw.de/dwtv_video/flv/'))
+ else:
+ formats = [{'url': hidden_inputs['file_name']}]
+
+ return {
+ 'id': media_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': hidden_inputs.get('preview_image'),
+ 'duration': int_or_none(hidden_inputs.get('file_duration')),
+ 'upload_date': hidden_inputs.get('display_date'),
+ 'formats': formats,
+ }
+
+
+class DWArticleIE(InfoExtractor):
+ IE_NAME = 'dw:article'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+ 'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+ 'info_dict': {
+ 'id': '19105868',
+ 'ext': 'mp4',
+ 'title': 'The harsh life of refugees in Idomeni',
+ 'description': 'md5:196015cc7e48ebf474db9399420043c7',
+ 'upload_date': '20160310',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ media_id = hidden_inputs['media_id']
+ media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+ media_url = compat_urlparse.urljoin(url, media_path)
+ return self.url_result(media_url, 'DW', media_id)
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index 0b61ea0ba..9a44f89f3 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -17,85 +17,85 @@ class EightTracksIE(InfoExtractor):
IE_NAME = '8tracks'
_VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
_TEST = {
- "name": "EightTracks",
- "url": "http://8tracks.com/ytdl/youtube-dl-test-tracks-a",
- "info_dict": {
+ 'name': 'EightTracks',
+ 'url': 'http://8tracks.com/ytdl/youtube-dl-test-tracks-a',
+ 'info_dict': {
'id': '1336550',
'display_id': 'youtube-dl-test-tracks-a',
- "description": "test chars: \"'/\\ä↭",
- "title": "youtube-dl test tracks \"'/\\ä↭<>",
+ 'description': "test chars: \"'/\\ä↭",
+ 'title': "youtube-dl test tracks \"'/\\ä↭<>",
},
- "playlist": [
+ 'playlist': [
{
- "md5": "96ce57f24389fc8734ce47f4c1abcc55",
- "info_dict": {
- "id": "11885610",
- "ext": "m4a",
- "title": "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': '96ce57f24389fc8734ce47f4c1abcc55',
+ 'info_dict': {
+ 'id': '11885610',
+ 'ext': 'm4a',
+ 'title': "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "4ab26f05c1f7291ea460a3920be8021f",
- "info_dict": {
- "id": "11885608",
- "ext": "m4a",
- "title": "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': '4ab26f05c1f7291ea460a3920be8021f',
+ 'info_dict': {
+ 'id': '11885608',
+ 'ext': 'm4a',
+ 'title': "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "d30b5b5f74217410f4689605c35d1fd7",
- "info_dict": {
- "id": "11885679",
- "ext": "m4a",
- "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': 'd30b5b5f74217410f4689605c35d1fd7',
+ 'info_dict': {
+ 'id': '11885679',
+ 'ext': 'm4a',
+ 'title': "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "4eb0a669317cd725f6bbd336a29f923a",
- "info_dict": {
- "id": "11885680",
- "ext": "m4a",
- "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': '4eb0a669317cd725f6bbd336a29f923a',
+ 'info_dict': {
+ 'id': '11885680',
+ 'ext': 'm4a',
+ 'title': "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "1893e872e263a2705558d1d319ad19e8",
- "info_dict": {
- "id": "11885682",
- "ext": "m4a",
- "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': '1893e872e263a2705558d1d319ad19e8',
+ 'info_dict': {
+ 'id': '11885682',
+ 'ext': 'm4a',
+ 'title': "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "b673c46f47a216ab1741ae8836af5899",
- "info_dict": {
- "id": "11885683",
- "ext": "m4a",
- "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': 'b673c46f47a216ab1741ae8836af5899',
+ 'info_dict': {
+ 'id': '11885683',
+ 'ext': 'm4a',
+ 'title': "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "1d74534e95df54986da7f5abf7d842b7",
- "info_dict": {
- "id": "11885684",
- "ext": "m4a",
- "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': '1d74534e95df54986da7f5abf7d842b7',
+ 'info_dict': {
+ 'id': '11885684',
+ 'ext': 'm4a',
+ 'title': "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
},
{
- "md5": "f081f47af8f6ae782ed131d38b9cd1c0",
- "info_dict": {
- "id": "11885685",
- "ext": "m4a",
- "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
- "uploader_id": "ytdl"
+ 'md5': 'f081f47af8f6ae782ed131d38b9cd1c0',
+ 'info_dict': {
+ 'id': '11885685',
+ 'ext': 'm4a',
+ 'title': "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
+ 'uploader_id': 'ytdl'
}
}
]
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py
index 476cce2d0..4c8190d68 100644
--- a/youtube_dl/extractor/ellentv.py
+++ b/youtube_dl/extractor/ellentv.py
@@ -72,7 +72,7 @@ class EllenTVClipsIE(InfoExtractor):
def _extract_playlist(self, webpage):
json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json')
try:
- return json.loads("[{" + json_string + "}]")
+ return json.loads('[{' + json_string + '}]')
except ValueError as ve:
raise ExtractorError('Failed to download JSON', cause=ve)
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
index 00a69e631..8c725a4e6 100644
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESC = 'El País'
- _TEST = {
+ _TESTS = [{
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
'md5': '98406f301f19562170ec071b83433d55',
'info_dict': {
@@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):
'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
'upload_date': '20140206',
}
- }
+ }, {
+ 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t',
+ 'md5': '3bd5b09509f3519d7d9e763179b013de',
+ 'info_dict': {
+ 'id': '1456340311_668921',
+ 'ext': 'mp4',
+ 'title': 'Cómo hacer el mejor café con cafetera italiana',
+ 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.',
+ 'upload_date': '20160303',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
- r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
+ r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')
video_suffix = self._search_regex(
- r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
+ r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')
video_url = prefix + video_suffix
thumbnail_suffix = self._search_regex(
- r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
- fatal=False)
+ r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'",
+ webpage, 'thumbnail URL', fatal=False)
thumbnail = (
None if thumbnail_suffix is None
else prefix + thumbnail_suffix)
title = self._html_search_regex(
- '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+            (r"tituloVideo\s*=\s*'([^']+)'",
+                r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),
webpage, 'title')
- date_str = self._search_regex(
+ upload_date = unified_strdate(self._search_regex(
r'<p class="date-header date-int updated"\s+title="([^"]+)">',
- webpage, 'upload date', fatal=False)
- upload_date = (None if date_str is None else unified_strdate(date_str))
+ webpage, 'upload date', default=None) or self._html_search_meta(
+ 'datePublished', webpage, 'timestamp'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index e4180701d..e5e57d485 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -1,21 +1,13 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- url_basename,
-)
class EngadgetIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video(?:/5min)?/(?P<id>\d+)|
- [\d/]+/.*?)
- '''
+    _VALID_URL = r'https?://www\.engadget\.com/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.engadget.com/video/5min/518153925/',
+ 'url': 'http://www.engadget.com/video/518153925/',
'md5': 'c6820d4828a5064447a4d9fc73f312c9',
'info_dict': {
'id': '518153925',
@@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- if video_id is not None:
- return self.url_result('5min:%s' % video_id)
- else:
- title = url_basename(url)
- webpage = self._download_webpage(url, title)
- ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': [self.url_result('5min:%s' % vid) for vid in ids]
- }
+ return self.url_result('5min:%s' % video_id)
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
index 493d38af8..84a9b750e 100644
--- a/youtube_dl/extractor/everyonesmixtape.py
+++ b/youtube_dl/extractor/everyonesmixtape.py
@@ -14,14 +14,14 @@ class EveryonesMixtapeIE(InfoExtractor):
_TESTS = [{
'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5',
- "info_dict": {
+ 'info_dict': {
'id': '5bfseWNmlds',
'ext': 'mp4',
- "title": "Passion Pit - \"Sleepyhead\" (Official Music Video)",
- "uploader": "FKR.TV",
- "uploader_id": "frenchkissrecords",
- "description": "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com",
- "upload_date": "20081015"
+ 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)",
+ 'uploader': 'FKR.TV',
+ 'uploader_id': 'frenchkissrecords',
+ 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com",
+ 'upload_date': '20081015'
},
'params': {
'skip_download': True, # This is simply YouTube
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py
index 4de02aee9..0c0fe6d65 100644
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -41,7 +41,7 @@ class ExfmIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
song_id = mobj.group('id')
- info_url = "http://ex.fm/api/v3/song/%s" % song_id
+ info_url = 'http://ex.fm/api/v3/song/%s' % song_id
info = self._download_json(info_url, song_id)['song']
song_url = info['url']
if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index ed237f081..f5bbd39d2 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -34,9 +34,12 @@ class FacebookIE(InfoExtractor):
video/video\.php|
photo\.php|
video\.php|
- video/embed
- )\?(?:.*?)(?:v|video_id)=|
- [^/]+/videos/(?:[^/]+/)?
+ video/embed|
+ story\.php
+ )\?(?:.*?)(?:v|video_id|story_fbid)=|
+ [^/]+/videos/(?:[^/]+/)?|
+ [^/]+/posts/|
+ groups/[^/]+/permalink/
)|
facebook:
)
@@ -49,6 +52,8 @@ class FacebookIE(InfoExtractor):
_CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
+ _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
+
_TESTS = [{
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659',
@@ -81,6 +86,33 @@ class FacebookIE(InfoExtractor):
'uploader': 'Demy de Zeeuw',
},
}, {
+ 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
+ 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
+ 'info_dict': {
+ 'id': '544765982287235',
+ 'ext': 'mp4',
+ 'title': '"What are you doing running in the snow?"',
+ 'uploader': 'FailArmy',
+ }
+ }, {
+ 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
+ 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
+ 'info_dict': {
+ 'id': '1035862816472149',
+ 'ext': 'mp4',
+ 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog',
+ 'uploader': 'S. Saint',
+ },
+ }, {
+ 'note': 'swf params escaped',
+ 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
+ 'md5': '97ba073838964d12c70566e0085c2b91',
+ 'info_dict': {
+ 'id': '10153664894881749',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #10153664894881749',
+ },
+ }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
}, {
@@ -92,6 +124,9 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'facebook:544765982287235',
'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+ 'only_matching': True,
}]
def _login(self):
@@ -160,19 +195,19 @@ class FacebookIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url):
- video_id = self._match_id(url)
- req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
+ def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+ req = sanitized_Request(url)
req.add_header('User-Agent', self._CHROME_USER_AGENT)
webpage = self._download_webpage(req, video_id)
video_data = None
- BEFORE = '{swf.addParam(param[0], param[1]);});\n'
+ BEFORE = '{swf.addParam(param[0], param[1]);});'
AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
- m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
+ m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
if m:
- data = dict(json.loads(m.group(1)))
+ swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+ data = dict(json.loads(swf_params))
params_raw = compat_urllib_parse_unquote(data['params'])
video_data = json.loads(params_raw)['video_data']
@@ -185,13 +220,15 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(self._search_regex(
- r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id)
- for item in server_js_data['instances']:
+ r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id)
+ for item in server_js_data.get('instances', []):
if item[1][0] == 'VideoConfig':
video_data = video_data_list2dict(item[2][0]['videoData'])
break
if not video_data:
+ if not fatal_if_no_video:
+ return webpage, False
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
@@ -208,10 +245,13 @@ class FacebookIE(InfoExtractor):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
+ preference = -10 if format_id == 'progressive' else 0
+ if quality == 'hd':
+ preference += 5
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,
- 'preference': -10 if format_id == 'progressive' else 0,
+ 'preference': preference,
})
dash_manifest = f[0].get('dash_manifest')
if dash_manifest:
@@ -234,39 +274,36 @@ class FacebookIE(InfoExtractor):
video_title = 'Facebook video #%s' % video_id
uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
- return {
+ info_dict = {
'id': video_id,
'title': video_title,
'formats': formats,
'uploader': uploader,
}
-
-class FacebookPostIE(InfoExtractor):
- IE_NAME = 'facebook:post'
- _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
- 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
- 'info_dict': {
- 'id': '544765982287235',
- 'ext': 'mp4',
- 'title': '"What are you doing running in the snow?"',
- 'uploader': 'FailArmy',
- }
- }
+ return webpage, info_dict
def _real_extract(self, url):
- post_id = self._match_id(url)
+ video_id = self._match_id(url)
+
+ real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
+ webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
- webpage = self._download_webpage(url, post_id)
+ if info_dict:
+ return info_dict
- entries = [
- self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
- for video_id in self._parse_json(
- self._search_regex(
- r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
- webpage, 'video ids', group='ids'),
- post_id)]
+ if '/posts/' in url:
+ entries = [
+ self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+ for vid in self._parse_json(
+ self._search_regex(
+ r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
+ webpage, 'video ids', group='ids'),
+ video_id)]
- return self.playlist_result(entries, post_id)
+ return self.playlist_result(entries, video_id)
+ else:
+ _, info_dict = self._extract_from_url(
+ self._VIDEO_PAGE_TEMPLATE % video_id,
+ video_id, fatal_if_no_video=True)
+ return info_dict
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 6f9b003c2..fd535457d 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -52,7 +52,7 @@ class FazIE(InfoExtractor):
formats = []
for pref, code in enumerate(['LOW', 'HIGH', 'HQ']):
encoding = xpath_element(encodings, code)
- if encoding:
+ if encoding is not None:
encoding_url = xpath_text(encoding, 'FILENAME')
if encoding_url:
formats.append({
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 4c81271d3..9580f5c0c 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -87,7 +87,7 @@ class FC2IE(InfoExtractor):
mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
info_url = (
- "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
+ 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'.
format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
info_webpage = self._download_webpage(
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 2955965d9..67d50a386 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -1,5 +1,7 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
@@ -16,12 +18,7 @@ from ..utils import (
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
- _VALID_URL = r'''(?x)
- (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
- https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
- 5min:)
- (?P<id>\d+)
- '''
+ _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'
_TESTS = [
{
@@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):
'title': 'How to Make a Next-Level Fruit Salad',
'duration': 184,
},
+ 'skip': 'no longer available',
},
]
_ERRORS = {
@@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ sid = mobj.group('sid')
+
+ if mobj.group('query'):
+ qs = compat_parse_qs(mobj.group('query'))
+ if not qs.get('playList'):
+ raise ExtractorError('Invalid URL', expected=True)
+ video_id = qs['playList'][0]
+ if qs.get('sid'):
+ sid = qs['sid'][0]
+
embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
- embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
- sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
- query = compat_urllib_parse.urlencode({
- 'func': 'GetResults',
- 'playlist': video_id,
- 'sid': sid,
- 'isPlayerSeed': 'true',
- 'url': embed_url,
- })
+ if not sid:
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed page')
+ sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+
response = self._download_json(
- 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+ 'https://syn.5min.com/handlers/SenseHandler.ashx?' +
+ compat_urllib_parse.urlencode({
+ 'func': 'GetResults',
+ 'playlist': video_id,
+ 'sid': sid,
+ 'isPlayerSeed': 'true',
+ 'url': embed_url,
+ }),
video_id)
if not response['success']:
raise ExtractorError(
@@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):
parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
for rendition in info['Renditions']:
- if rendition['RenditionType'] == 'm3u8':
- formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
- elif rendition['RenditionType'] == 'aac':
+ if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':
continue
else:
rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 318ac013d..1dc50318c 100644
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE):
# 'upload_date': '20141204',
'thumbnail': 're:^https?://.*\.jpg$',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py
index fdc51f44f..0388ba00c 100644
--- a/youtube_dl/extractor/franceinter.py
+++ b/youtube_dl/extractor/franceinter.py
@@ -10,7 +10,7 @@ class FranceInterIE(InfoExtractor):
_TEST = {
'url': 'http://www.franceinter.fr/player/reecouter?play=793962',
'md5': '4764932e466e6f6c79c317d2e74f6884',
- "info_dict": {
+ 'info_dict': {
'id': '793962',
'ext': 'mp3',
'title': 'L’Histoire dans les jeux vidéo',
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 8e60cf60f..3f4ac3093 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -289,7 +289,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_id, catalogue = self._html_search_regex(
- r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+ r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
webpage, 'video ID').split('@')
return self._extract_video(video_id, catalogue)
diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py
index c210177f7..1477708bb 100644
--- a/youtube_dl/extractor/freespeech.py
+++ b/youtube_dl/extractor/freespeech.py
@@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor):
'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0',
'info_dict': {
'id': 'poKsVCZ64uU',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'description': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'uploader': 'freespeechtv',
diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py
index f755e3c4a..c7bec027b 100644
--- a/youtube_dl/extractor/freevideo.py
+++ b/youtube_dl/extractor/freevideo.py
@@ -12,8 +12,8 @@ class FreeVideoIE(InfoExtractor):
'info_dict': {
'id': 'vysukany-zadecek-22033',
'ext': 'mp4',
- "title": "vysukany-zadecek-22033",
- "age_limit": 18,
+ 'title': 'vysukany-zadecek-22033',
+ 'age_limit': 18,
},
'skip': 'Blocked outside .cz',
}
diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py
index 25870c131..a66e309de 100644
--- a/youtube_dl/extractor/gameinformer.py
+++ b/youtube_dl/extractor/gameinformer.py
@@ -2,42 +2,27 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import int_or_none
class GameInformerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>.+)\.aspx'
_TEST = {
'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx',
+ 'md5': '292f26da1ab4beb4c9099f1304d2b071',
'info_dict': {
'id': '4515472681001',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': 'Replay - Animal Crossing',
'description': 'md5:2e211891b215c85d061adc7a4dd2d930',
- 'timestamp': 1443457610706,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'timestamp': 1443457610,
+ 'upload_date': '20150928',
+ 'uploader_id': '694940074001',
},
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
-
- bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url')
- json_data = self._download_json(
- bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions',
- display_id)
-
- return {
- 'id': compat_str(json_data['id']),
- 'display_id': display_id,
- 'url': json_data['IOSRenditions'][0]['url'],
- 'title': json_data['name'],
- 'description': json_data.get('shortDescription'),
- 'timestamp': int_or_none(json_data.get('publishedDate')),
- 'duration': int_or_none(json_data.get('length')),
- }
+ brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id')
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index bf61ab2e7..8121f04a5 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE
from .svt import SVTIE
from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
@@ -224,6 +225,20 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # MPD from http://dash-mse-test.appspot.com/media.html
+ {
+ 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd',
+ 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53',
+ 'info_dict': {
+ 'id': 'car-20120827-manifest',
+ 'ext': 'mp4',
+ 'title': 'car-20120827-manifest',
+ 'formats': 'mincount:9',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ },
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -1227,28 +1242,34 @@ class GenericIE(InfoExtractor):
full_response = self._request_webpage(request, video_id)
head_response = full_response
+ info_dict = {
+ 'id': video_id,
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ }
+
# Check for direct link to a video
content_type = head_response.headers.get('Content-Type', '')
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
if m:
upload_date = unified_strdate(
head_response.headers.get('Last-Modified'))
- formats = []
- if m.group('format_id').endswith('mpegurl'):
+ format_id = m.group('format_id')
+ if format_id.endswith('mpegurl'):
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+ elif format_id == 'f4m':
+ formats = self._extract_f4m_formats(url, video_id)
else:
formats = [{
'format_id': m.group('format_id'),
'url': url,
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ info_dict.update({
'direct': True,
'formats': formats,
'upload_date': upload_date,
- }
+ })
+ return info_dict
if not self._downloader.params.get('test', False) and not is_intentional:
force = self._downloader.params.get('force_generic_extractor', False)
@@ -1276,13 +1297,12 @@ class GenericIE(InfoExtractor):
'URL could be a direct video link, returning it as such.')
upload_date = unified_strdate(
head_response.headers.get('Last-Modified'))
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ info_dict.update({
'direct': True,
'url': url,
'upload_date': upload_date,
- }
+ })
+ return info_dict
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
@@ -1299,11 +1319,12 @@ class GenericIE(InfoExtractor):
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
- 'formats': self._parse_mpd_formats(doc, video_id),
- }
+ info_dict['formats'] = self._parse_mpd_formats(
+ doc, video_id, mpd_base_url=url.rpartition('/')[0])
+ return info_dict
+ elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+ info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+ return info_dict
except compat_xml_parse_error:
pass
@@ -1413,7 +1434,7 @@ class GenericIE(InfoExtractor):
# Look for embedded Dailymotion player
matches = re.findall(
- r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+ r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
if matches:
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
@@ -1558,6 +1579,11 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'VK')
+ # Look for embedded Odnoklassniki player
+ mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Odnoklassniki')
+
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
if mobj is not None:
@@ -1613,6 +1639,11 @@ class GenericIE(InfoExtractor):
if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+ # Look for embedded TNAFlixNetwork player
+ tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+ if tnaflix_urls:
+ return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+
# Look for embedded Tvigle player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1959,6 +1990,8 @@ class GenericIE(InfoExtractor):
entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
elif ext == 'mpd':
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+ elif ext == 'f4m':
+ entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
else:
entry_info_dict['url'] = video_url
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
index b241c4868..3de8356f6 100644
--- a/youtube_dl/extractor/globo.py
+++ b/youtube_dl/extractor/globo.py
@@ -65,7 +65,7 @@ class GloboIE(InfoExtractor):
'only_matching': True,
}]
- class MD5:
+ class MD5(object):
HEX_FORMAT_LOWERCASE = 0
HEX_FORMAT_UPPERCASE = 1
BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = ''
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index f354c9c7a..766fc26d0 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -10,8 +10,8 @@ from ..utils import (
class GoogleDriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
- _TEST = {
+ _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+ _TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': '881f7700aec4f538571fa1e0eed4a7b6',
'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
'title': 'Big Buck Bunny.mp4',
'duration': 46,
}
- }
+ }, {
+ # video id is longer than 28 characters
+ 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+ 'only_matching': True,
+ }]
_FORMATS_EXT = {
'5': 'flv',
'6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+ r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
webpage)
if mobj:
return 'https://drive.google.com/file/d/%s' % mobj.group('id')
@@ -82,7 +86,7 @@ class GoogleDriveIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
index f5aa73d18..86a93de4d 100644
--- a/youtube_dl/extractor/hentaistigma.py
+++ b/youtube_dl/extractor/hentaistigma.py
@@ -11,8 +11,8 @@ class HentaiStigmaIE(InfoExtractor):
'info_dict': {
'id': 'inyouchuu-etsu-bonus',
'ext': 'mp4',
- "title": "Inyouchuu Etsu Bonus",
- "age_limit": 18,
+ 'title': 'Inyouchuu Etsu Bonus',
+ 'age_limit': 18,
}
}
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 02e1e428e..b61b2dc4e 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor):
for f_url, f_name in extra_formats]
format_pages.append(player_page)
- quality = qualities(['SD', '480p', '720p'])
+ quality = qualities(('SD', '480p', '720p', '1080p'))
formats = []
for format_page in format_pages:
json_data = self._search_regex(
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 12fb5e8e1..9622f198a 100644
--- a/youtube_dl/extractor/indavideo.py
+++ b/youtube_dl/extractor/indavideo.py
@@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor):
'url': self._proto_relative_url(thumbnail)
} for thumbnail in video.get('thumbnails', [])]
- tags = [tag['title'] for tag in video.get('tags', [])]
+ tags = [tag['title'] for tag in video.get('tags') or []]
return {
'id': video.get('id') or video_id,
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 016af2084..cca0b8a93 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -4,15 +4,12 @@ from __future__ import unicode_literals
import base64
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_parse_qs,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import determine_ext
+from .bokecc import BokeCCBaseIE
-class InfoQIE(InfoExtractor):
+class InfoQIE(BokeCCBaseIE):
_VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
@@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor):
},
}]
- def _extract_bokecc_videos(self, webpage, video_id):
- # TODO: bokecc.com is a Chinese video cloud platform
- # It should have an independent extractor but I don't have other
- # examples using bokecc
- player_params_str = self._html_search_regex(
- r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
- webpage, 'player params', default=None)
-
- player_params = compat_parse_qs(player_params_str)
-
- info_xml = self._download_xml(
- 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
- player_params['siteid'][0], player_params['vid'][0]), video_id)
-
- return [{
- 'format_id': 'bokecc',
- 'url': quality.find('./copy').attrib['playurl'],
- 'preference': int(quality.attrib['value']),
- } for quality in info_xml.findall('./video/quality')]
-
def _extract_rtmp_videos(self, webpage):
# The server URL is hardcoded
video_url = 'rtmpe://video.infoq.com/cfx/st/'
@@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor):
if '/cn/' in url:
# for China videos, HTTP video URL exists but always fails with 403
- formats = self._extract_bokecc_videos(webpage, video_id)
+ formats = self._extract_bokecc_formats(webpage, video_id)
else:
formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 691cb66d6..e7c0cb3f6 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -2,14 +2,163 @@
from __future__ import unicode_literals
import hashlib
+import itertools
import math
+import os
import random
+import re
import time
import uuid
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import ExtractorError
+from ..compat import (
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+)
+from ..utils import (
+ decode_packed_codes,
+ ExtractorError,
+ ohdave_rsa_encrypt,
+ remove_start,
+ sanitized_Request,
+ urlencode_postdata,
+ url_basename,
+)
+
+
+def md5_text(text):
+ return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+
+class IqiyiSDK(object):
+ def __init__(self, target, ip, timestamp):
+ self.target = target
+ self.ip = ip
+ self.timestamp = timestamp
+
+ @staticmethod
+ def split_sum(data):
+ return compat_str(sum(map(lambda p: int(p, 16), list(data))))
+
+ @staticmethod
+ def digit_sum(num):
+ if isinstance(num, int):
+ num = compat_str(num)
+ return compat_str(sum(map(int, num)))
+
+ def even_odd(self):
+ even = self.digit_sum(compat_str(self.timestamp)[::2])
+ odd = self.digit_sum(compat_str(self.timestamp)[1::2])
+ return even, odd
+
+ def preprocess(self, chunksize):
+ self.target = md5_text(self.target)
+ chunks = []
+ for i in range(32 // chunksize):
+ chunks.append(self.target[chunksize * i:chunksize * (i + 1)])
+ if 32 % chunksize:
+ chunks.append(self.target[32 - 32 % chunksize:])
+ return chunks, list(map(int, self.ip.split('.')))
+
+ def mod(self, modulus):
+ chunks, ip = self.preprocess(32)
+ self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip))
+
+ def split(self, chunksize):
+ modulus_map = {
+ 4: 256,
+ 5: 10,
+ 8: 100,
+ }
+
+ chunks, ip = self.preprocess(chunksize)
+ ret = ''
+ for i in range(len(chunks)):
+ ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else ''
+ if chunksize == 8:
+ ret += ip_part + chunks[i]
+ else:
+ ret += chunks[i] + ip_part
+ self.target = ret
+
+ def handle_input16(self):
+ self.target = md5_text(self.target)
+ self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:])
+
+ def handle_input8(self):
+ self.target = md5_text(self.target)
+ ret = ''
+ for i in range(4):
+ part = self.target[8 * i:8 * (i + 1)]
+ ret += self.split_sum(part) + part
+ self.target = ret
+
+ def handleSum(self):
+ self.target = md5_text(self.target)
+ self.target = self.split_sum(self.target) + self.target
+
+ def date(self, scheme):
+ self.target = md5_text(self.target)
+ d = time.localtime(self.timestamp)
+ strings = {
+ 'y': compat_str(d.tm_year),
+ 'm': '%02d' % d.tm_mon,
+ 'd': '%02d' % d.tm_mday,
+ }
+ self.target += ''.join(map(lambda c: strings[c], list(scheme)))
+
+ def split_time_even_odd(self):
+ even, odd = self.even_odd()
+ self.target = odd + md5_text(self.target) + even
+
+ def split_time_odd_even(self):
+ even, odd = self.even_odd()
+ self.target = even + md5_text(self.target) + odd
+
+ def split_ip_time_sum(self):
+ chunks, ip = self.preprocess(32)
+ self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp)
+
+ def split_time_ip_sum(self):
+ chunks, ip = self.preprocess(32)
+ self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip))
+
+
+class IqiyiSDKInterpreter(object):
+ def __init__(self, sdk_code):
+ self.sdk_code = sdk_code
+
+ def run(self, target, ip, timestamp):
+ self.sdk_code = decode_packed_codes(self.sdk_code)
+
+ functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
+
+ sdk = IqiyiSDK(target, ip, timestamp)
+
+ other_functions = {
+ 'handleSum': sdk.handleSum,
+ 'handleInput8': sdk.handle_input8,
+ 'handleInput16': sdk.handle_input16,
+ 'splitTimeEvenOdd': sdk.split_time_even_odd,
+ 'splitTimeOddEven': sdk.split_time_odd_even,
+ 'splitIpTimeSum': sdk.split_ip_time_sum,
+ 'splitTimeIpSum': sdk.split_time_ip_sum,
+ }
+ for function in functions:
+ if re.match(r'mod\d+', function):
+ sdk.mod(int(function[3:]))
+ elif re.match(r'date[ymd]{3}', function):
+ sdk.date(function[4:])
+ elif re.match(r'split\d+', function):
+ sdk.split(int(function[5:]))
+ elif function in other_functions:
+ other_functions[function]()
+ else:
+ raise ExtractorError('Unknown function %s' % function)
+
+ return sdk.target
class IqiyiIE(InfoExtractor):
@@ -18,6 +167,8 @@ class IqiyiIE(InfoExtractor):
_VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+ _NETRC_MACHINE = 'iqiyi'
+
_TESTS = [{
'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
'md5': '2cb594dc2781e6c941a110d8f358118b',
@@ -93,6 +244,35 @@ class IqiyiIE(InfoExtractor):
}, {
'url': 'http://yule.iqiyi.com/pcb.html',
'only_matching': True,
+ }, {
+ # VIP-only video. The first 2 parts (6 minutes) are available without login
+ # MD5 sums omitted as values are different on Travis CI and my machine
+ 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
+ 'info_dict': {
+ 'id': 'f3cf468b39dddb30d676f89a91200dc1',
+ 'title': '泰坦尼克号',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
+ 'ext': 'f4v',
+ 'title': '泰坦尼克号',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
+ 'ext': 'f4v',
+ 'title': '泰坦尼克号',
+ },
+ }],
+ 'expected_warnings': ['Needs a VIP account for full video'],
+ }, {
+ 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+ 'info_dict': {
+ 'id': '202918101',
+ 'title': '灌篮高手 国语版',
+ },
+ 'playlist_count': 101,
}]
_FORMATS_MAP = [
@@ -104,11 +284,98 @@ class IqiyiIE(InfoExtractor):
('10', 'h1'),
]
+ def _real_initialize(self):
+ self._login()
+
@staticmethod
- def md5_text(text):
- return hashlib.md5(text.encode('utf-8')).hexdigest()
+ def _rsa_fun(data):
+ # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
+ N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
+ e = 65537
+
+ return ohdave_rsa_encrypt(data, e, N)
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+
+ # No authentication to be performed
+ if not username:
+ return True
+
+ data = self._download_json(
+ 'http://kylin.iqiyi.com/get_token', None,
+ note='Get token for logging in', errnote='Unable to get token for logging in')
+ sdk = data['sdk']
+ timestamp = int(time.time())
+ target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
+ username, self._rsa_fun(password.encode('utf-8')))
+
+ interp = IqiyiSDKInterpreter(sdk)
+ sign = interp.run(target, data['ip'], timestamp)
+
+ validation_params = {
+ 'target': target,
+ 'server': 'BEA3AA1908656AABCCFF76582C4C6660',
+ 'token': data['token'],
+ 'bird_src': 'f8d91d57af224da7893dd397d52d811a',
+ 'sign': sign,
+ 'bird_t': timestamp,
+ }
+ validation_result = self._download_json(
+ 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+ note='Validate credentials', errnote='Unable to validate credentials')
- def construct_video_urls(self, data, video_id, _uuid):
+ MSG_MAP = {
+ 'P00107': 'please login via the web interface and enter the CAPTCHA code',
+ 'P00117': 'bad username or password',
+ }
+
+ code = validation_result['code']
+ if code != 'A00000':
+ msg = MSG_MAP.get(code)
+ if not msg:
+ msg = 'error %s' % code
+ if validation_result.get('msg'):
+ msg += ': ' + validation_result['msg']
+ self._downloader.report_warning('unable to log in: ' + msg)
+ return False
+
+ return True
+
+ def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
+ auth_params = {
+ # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
+ 'version': '2.0',
+ 'platform': 'b6c13e26323c537d',
+ 'aid': tvid,
+ 'tvid': tvid,
+ 'uid': '',
+ 'deviceId': _uuid,
+ 'playType': 'main', # XXX: always main?
+ 'filename': os.path.splitext(url_basename(api_video_url))[0],
+ }
+
+ qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
+ for key, val in qd_items.items():
+ auth_params[key] = val[0]
+
+ auth_req = sanitized_Request(
+ 'http://api.vip.iqiyi.com/services/ckn.action',
+ urlencode_postdata(auth_params))
+ # iQiyi server throws HTTP 405 error without the following header
+ auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ auth_result = self._download_json(
+ auth_req, video_id,
+ note='Downloading video authentication JSON',
+ errnote='Unable to download video authentication JSON')
+ if auth_result['code'] == 'Q00506': # requires a VIP account
+ if do_report_warning:
+ self.report_warning('Needs a VIP account for full video')
+ return False
+
+ return auth_result
+
+ def construct_video_urls(self, data, video_id, _uuid, tvid):
def do_xor(x, y):
a = y % 3
if a == 1:
@@ -134,9 +401,10 @@ class IqiyiIE(InfoExtractor):
note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
)['t']
t = str(int(math.floor(int(tm) / (600.0))))
- return self.md5_text(t + mg + x)
+ return md5_text(t + mg + x)
video_urls_dict = {}
+ need_vip_warning_report = True
for format_item in data['vp']['tkl'][0]['vs']:
if 0 < int(format_item['bid']) <= 10:
format_id = self.get_format(format_item['bid'])
@@ -155,11 +423,13 @@ class IqiyiIE(InfoExtractor):
vl = segment['l']
if not vl.startswith('/'):
vl = get_encode_code(vl)
- key = get_path_key(
- vl.split('/')[-1].split('.')[0], format_id, segment_index)
+ is_vip_video = '/vip/' in vl
filesize = segment['b']
base_url = data['vp']['du'].split('/')
- base_url.insert(-1, key)
+ if not is_vip_video:
+ key = get_path_key(
+ vl.split('/')[-1].split('.')[0], format_id, segment_index)
+ base_url.insert(-1, key)
base_url = '/'.join(base_url)
param = {
'su': _uuid,
@@ -170,8 +440,23 @@ class IqiyiIE(InfoExtractor):
'ct': '',
'tn': str(int(time.time()))
}
- api_video_url = base_url + vl + '?' + \
- compat_urllib_parse.urlencode(param)
+ api_video_url = base_url + vl
+ if is_vip_video:
+ api_video_url = api_video_url.replace('.f4v', '.hml')
+ auth_result = self._authenticate_vip_video(
+ api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
+ if auth_result is False:
+ need_vip_warning_report = False
+ break
+ param.update({
+ 't': auth_result['data']['t'],
+ # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
+ 'cid': 'afbe8fd3d73448c9',
+ 'vid': video_id,
+ 'QY00001': auth_result['data']['u'],
+ })
+ api_video_url += '?' if '?' not in api_video_url else '&'
+ api_video_url += compat_urllib_parse.urlencode(param)
js = self._download_json(
api_video_url, video_id,
note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
@@ -195,16 +480,17 @@ class IqiyiIE(InfoExtractor):
tail = tm + tvid
param = {
'key': 'fvip',
- 'src': self.md5_text('youtube-dl'),
+ 'src': md5_text('youtube-dl'),
'tvId': tvid,
'vid': video_id,
'vinfo': 1,
'tm': tm,
- 'enc': self.md5_text(enc_key + tail),
+ 'enc': md5_text(enc_key + tail),
'qyid': _uuid,
'tn': random.random(),
'um': 0,
- 'authkey': self.md5_text(self.md5_text('') + tail),
+ 'authkey': md5_text(md5_text('') + tail),
+ 'k_tag': 1,
}
api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
@@ -212,40 +498,75 @@ class IqiyiIE(InfoExtractor):
raw_data = self._download_json(api_url, video_id)
return raw_data
- def get_enc_key(self, swf_url, video_id):
+ def get_enc_key(self, video_id):
# TODO: automatic key extraction
# last update at 2016-01-22 for Zombie::bite
- enc_key = '6ab6d0280511493ba85594779759d4ed'
+ enc_key = '8ed797d224d043e7ac23d95b70227d32'
return enc_key
+ def _extract_playlist(self, webpage):
+ PAGE_SIZE = 50
+
+ links = re.findall(
+ r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+ webpage)
+ if not links:
+ return
+
+ album_id = self._search_regex(
+ r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+ album_title = self._search_regex(
+ r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+ entries = list(map(self.url_result, links))
+
+ # Start from 2 because links in the first page are already on webpage
+ for page_num in itertools.count(2):
+ pagelist_page = self._download_webpage(
+ 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+ album_id,
+ note='Download playlist page %d' % page_num,
+ errnote='Failed to download playlist page %d' % page_num)
+ pagelist = self._parse_json(
+ remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+ vlist = pagelist['data']['vlist']
+ for item in vlist:
+ entries.append(self.url_result(item['vurl']))
+ if len(vlist) < PAGE_SIZE:
+ break
+
+ return self.playlist_result(entries, album_id, album_title)
+
def _real_extract(self, url):
webpage = self._download_webpage(
url, 'temp_id', note='download video page')
+
+ # There's no simple way to determine whether a URL is a playlist or not
+ # So detect it
+ playlist_result = self._extract_playlist(webpage)
+ if playlist_result:
+ return playlist_result
+
tvid = self._search_regex(
r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
video_id = self._search_regex(
r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
- swf_url = self._search_regex(
- r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
_uuid = uuid.uuid4().hex
- enc_key = self.get_enc_key(swf_url, video_id)
+ enc_key = self.get_enc_key(video_id)
raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
if raw_data['code'] != 'A000000':
raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
- if not raw_data['data']['vp']['tkl']:
- raise ExtractorError('No support iQiqy VIP video')
-
data = raw_data['data']
title = data['vi']['vn']
# generate video_urls_dict
video_urls_dict = self.construct_video_urls(
- data, video_id, _uuid)
+ data, video_id, _uuid, tvid)
# construct info
entries = []
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index eef7daa29..137db873c 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):
webpage = self._download_webpage(url, title)
title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
config_url = self._html_search_regex(
- r'data-src="(/contenu/medias/video.php.*?)"',
+ r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',
webpage, 'config URL')
config_url = 'http://www.jeuxvideo.com' + config_url
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
index 8e90d5986..6770685d7 100644
--- a/youtube_dl/extractor/jwplatform.py
+++ b/youtube_dl/extractor/jwplatform.py
@@ -7,33 +7,9 @@ from .common import InfoExtractor
from ..utils import int_or_none
-class JWPlatformIE(InfoExtractor):
- _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
- _TEST = {
- 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
- 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
- 'info_dict': {
- 'id': 'nPripu9l',
- 'ext': 'mov',
- 'title': 'Big Buck Bunny Trailer',
- 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
- 'upload_date': '20081127',
- 'timestamp': 1227796140,
- }
- }
-
- @staticmethod
- def _extract_url(webpage):
- mobj = re.search(
- r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
- webpage)
- if mobj:
- return mobj.group('url')
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
- video_data = json_data['playlist'][0]
+class JWPlatformBaseIE(InfoExtractor):
+ def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):
+ video_data = jwplayer_data['playlist'][0]
subtitles = {}
for track in video_data['tracks']:
if track['kind'] == 'captions':
@@ -43,7 +19,7 @@ class JWPlatformIE(InfoExtractor):
for source in video_data['sources']:
source_url = self._proto_relative_url(source['file'])
source_type = source.get('type') or ''
- if source_type == 'application/vnd.apple.mpegurl':
+ if source_type in ('application/vnd.apple.mpegurl', 'hls'):
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
elif source_type.startswith('audio'):
@@ -61,10 +37,39 @@ class JWPlatformIE(InfoExtractor):
return {
'id': video_id,
- 'title': video_data['title'],
+ 'title': video_data['title'] if require_title else video_data.get('title'),
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'timestamp': int_or_none(video_data.get('pubdate')),
'subtitles': subtitles,
'formats': formats,
}
+
+
+class JWPlatformIE(JWPlatformBaseIE):
+ _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+ _TEST = {
+ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
+ 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+ 'info_dict': {
+ 'id': 'nPripu9l',
+ 'ext': 'mov',
+ 'title': 'Big Buck Bunny Trailer',
+ 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
+ 'upload_date': '20081127',
+ 'timestamp': 1227796140,
+ }
+ }
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+ return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index ccbc39c66..44d7c84a1 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urlparse,
+ compat_parse_qs,
)
from ..utils import (
clean_html,
@@ -20,21 +21,17 @@ from ..utils import (
class KalturaIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
- kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+ kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
(?:
(?:
# flash player
- index\.php/kwidget/
- (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
- (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+ index\.php/kwidget|
# html5 player
- html5/html5lib/
- (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
- .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+ html5/html5lib/[^/]+/mwEmbedFrame\.php
)
- )
+ )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
)
'''
_API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
@@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
- partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
- entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
-
- info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ partner_id, entry_id = mobj.group('partner_id', 'id')
+ ks = None
+ if partner_id and entry_id:
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ else:
+ path, query = mobj.group('path', 'query')
+ if not path and not query:
+ raise ExtractorError('Invalid URL', expected=True)
+ params = {}
+ if query:
+ params = compat_parse_qs(query)
+ if path:
+ splitted_path = path.split('/')
+ params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))))
+ if 'wid' in params:
+ partner_id = params['wid'][0][1:]
+ elif 'p' in params:
+ partner_id = params['p'][0]
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ if 'entry_id' in params:
+ entry_id = params['entry_id'][0]
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+ reference_id = params['flashvars[referenceId]'][0]
+ webpage = self._download_webpage(url, reference_id)
+ entry_data = self._parse_json(self._search_regex(
+ r'window\.kalturaIframePackageData\s*=\s*({.*});',
+ webpage, 'kalturaIframePackageData'),
+ reference_id)['entryResult']
+ info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+ entry_id = info['id']
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ ks = params.get('flashvars[ks]', [None])[0]
source_url = smuggled_data.get('source_url')
if source_url:
@@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):
else:
referrer = None
+ def sign_url(unsigned_url):
+ if ks:
+ unsigned_url += '/ks/%s' % ks
+ if referrer:
+ unsigned_url += '?referrer=%s' % referrer
+ return unsigned_url
+
formats = []
for f in flavor_assets:
# Continue if asset is not ready
if f['status'] != 2:
continue
- video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
- if referrer:
- video_url += '?referrer=%s' % referrer
+ video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
formats.append({
'format_id': '%(fileExt)s-%(bitrate)s' % f,
'ext': f.get('fileExt'),
@@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):
'width': int_or_none(f.get('width')),
'url': video_url,
})
- m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp')
- if referrer:
- m3u8_url += '?referrer=%s' % referrer
+ m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
formats.extend(self._extract_m3u8_formats(
m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
index 364dc878e..a677ff447 100644
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -28,7 +28,7 @@ class KankanIE(InfoExtractor):
title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, 'video title')
surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
- gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls)
+ gcids = re.findall(r'http://.+?/.+?/(.+?)/', surls)
gcid = gcids[-1]
info_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 08a671fa8..61739efa7 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.khanacademy.org/video/one-time-pad',
- 'md5': '7021db7f2d47d4fff89b13177cb1e8f4',
+ 'md5': '7b391cce85e758fb94f763ddc1bbb979',
'info_dict': {
'id': 'one-time-pad',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'duration': 176,
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644
index 000000000..931f34c9b
--- /dev/null
+++ b/youtube_dl/extractor/kusi.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ timeconvert,
+ update_url_query,
+ xpath_text,
+)
+
+
+class KUSIIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+ _TESTS = [{
+ 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold',
+ 'md5': 'f926e7684294cf8cb7bdf8858e1b3988',
+ 'info_dict': {
+ 'id': '12203019',
+ 'ext': 'mp4',
+ 'title': 'Turko Files: Case Closed! & Put On Hold!',
+ 'duration': 231.0,
+ 'upload_date': '20160210',
+ 'timestamp': 1455087571,
+ 'thumbnail': 're:^https?://.*\.jpg$'
+ },
+ }, {
+ 'url': 'http://kusi.com/video?clipId=12203019',
+ 'info_dict': {
+ 'id': '12203019',
+ 'ext': 'mp4',
+ 'title': 'Turko Files: Case Closed! & Put On Hold!',
+ 'duration': 231.0,
+ 'upload_date': '20160210',
+ 'timestamp': 1455087571,
+ 'thumbnail': 're:^https?://.*\.jpg$'
+ },
+ 'params': {
+ 'skip_download': True, # Same as previous one
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ clip_id = mobj.group('clipId')
+ video_id = clip_id or mobj.group('path')
+
+ webpage = self._download_webpage(url, video_id)
+
+ if clip_id is None:
+ video_id = clip_id = self._html_search_regex(
+ r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
+
+ affiliate_id = self._search_regex(
+ r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
+
+ # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
+ xml_url = update_url_query('http://www.kusi.com/build.asp', {
+ 'buildtype': 'buildfeaturexmlrequest',
+ 'featureType': 'Clip',
+ 'featureid': clip_id,
+ 'affiliateno': affiliate_id,
+ 'clientgroupid': '1',
+ 'rnd': int(round(random.random() * 1000000)),
+ })
+
+ doc = self._download_xml(xml_url, video_id)
+
+ video_title = xpath_text(doc, 'HEADLINE', fatal=True)
+ duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
+ description = xpath_text(doc, 'ABSTRACT')
+ thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
+ creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+
+ quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
+ formats = []
+ for quality in quality_options:
+ formats.append({
+ 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']),
+ 'height': int_or_none(quality.attrib.get('height')),
+ 'width': int_or_none(quality.attrib.get('width')),
+ 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'description': description,
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'timestamp': creation_time,
+ }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index f641edef8..700e44b63 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -68,6 +68,7 @@ class KuwoIE(KuwoBaseIE):
'id': '6446136',
'ext': 'mp3',
'title': '心',
+ 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a',
'creator': 'IU',
'upload_date': '20150518',
},
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index b459559b0..5d8ebbeb3 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -1,86 +1,125 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
-import random
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
+ sanitized_Request,
+ unified_strdate,
+ urlencode_postdata,
+ xpath_element,
xpath_text,
)
class Laola1TvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)'
+ _TESTS = [{
'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html',
'info_dict': {
'id': '227883',
- 'ext': 'mp4',
+ 'display_id': 'straubing-tigers-koelner-haie',
+ 'ext': 'flv',
'title': 'Straubing Tigers - Kölner Haie',
+ 'upload_date': '20140912',
+ 'is_live': False,
'categories': ['Eishockey'],
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie',
+ 'info_dict': {
+ 'id': '464602',
+ 'display_id': 'straubing-tigers-koelner-haie',
+ 'ext': 'flv',
+ 'title': 'Straubing Tigers - Kölner Haie',
+ 'upload_date': '20160129',
'is_live': False,
+ 'categories': ['Eishockey'],
},
'params': {
'skip_download': True,
}
- }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ display_id = mobj.group('slug')
lang = mobj.group('lang')
portal = mobj.group('portal')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id)
+
iframe_url = self._search_regex(
- r'<iframe[^>]*?class="main_tv_player"[^>]*?src="([^"]+)"',
- webpage, 'iframe URL')
+ r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
+ webpage, 'iframe url')
+
+ video_id = self._search_regex(
+ r'videoid=(\d+)', iframe_url, 'video id')
- iframe = self._download_webpage(
- iframe_url, video_id, note='Downloading iframe')
- flashvars_m = re.findall(
- r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe)
- flashvars = dict((m[0], m[1]) for m in flashvars_m)
+ iframe = self._download_webpage(compat_urlparse.urljoin(
+ url, iframe_url), display_id, 'Downloading iframe')
partner_id = self._search_regex(
- r'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id')
-
- xml_url = ('http://www.laola1.tv/server/hd_video.php?' +
- 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % (
- video_id, partner_id, portal, lang))
- hd_doc = self._download_xml(xml_url, video_id)
-
- title = xpath_text(hd_doc, './/video/title', fatal=True)
- flash_url = xpath_text(hd_doc, './/video/url', fatal=True)
- uploader = xpath_text(hd_doc, './/video/meta_organistation')
- is_live = xpath_text(hd_doc, './/video/islive') == 'true'
-
- categories = xpath_text(hd_doc, './/video/meta_sports')
- if categories:
- categories = categories.split(',')
-
- ident = random.randint(10000000, 99999999)
- token_url = '%s&ident=%s&klub=0&unikey=0&timestamp=%s&auth=%s' % (
- flash_url, ident, flashvars['timestamp'], flashvars['auth'])
-
- token_doc = self._download_xml(
- token_url, video_id, note='Downloading token')
- token_attrib = token_doc.find('.//token').attrib
- if token_attrib.get('auth') in ('blocked', 'restricted'):
+ r'partnerid\s*:\s*(["\'])(?P<partner_id>.+?)\1',
+ iframe, 'partner id', group='partner_id')
+
+ hd_doc = self._download_xml(
+ 'http://www.laola1.tv/server/hd_video.php?%s'
+ % compat_urllib_parse.urlencode({
+ 'play': video_id,
+ 'partner': partner_id,
+ 'portal': portal,
+ 'lang': lang,
+ 'v5ident': '',
+ }), display_id)
+
+ _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k)
+ title = _v('title', fatal=True)
+
+ req = sanitized_Request(
+ 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' %
+ compat_urllib_parse.urlencode({
+ 'videoId': video_id,
+ 'target': '2',
+ 'label': 'laola1tv',
+ 'area': _v('area'),
+ }),
+ urlencode_postdata(
+ dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))))
+
+ token_url = self._download_json(req, display_id)['data']['stream-access'][0]
+ token_doc = self._download_xml(token_url, display_id, 'Downloading token')
+
+ token_attrib = xpath_element(token_doc, './/token').attrib
+ token_auth = token_attrib['auth']
+
+ if token_auth in ('blocked', 'restricted', 'error'):
raise ExtractorError(
- 'Token error: %s' % token_attrib.get('comment'), expected=True)
+ 'Token error: %s' % token_attrib['comment'], expected=True)
+
+ formats = self._extract_f4m_formats(
+ '%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], token_auth),
+ video_id, f4m_id='hds')
- video_url = '%s?hdnea=%s&hdcore=3.2.0' % (
- token_attrib['url'], token_attrib['auth'])
+ categories_str = _v('meta_sports')
+ categories = categories_str.split(',') if categories_str else []
return {
'id': video_id,
- 'is_live': is_live,
+ 'display_id': display_id,
'title': title,
- 'url': video_url,
- 'uploader': uploader,
+ 'upload_date': unified_strdate(_v('time_date')),
+ 'uploader': _v('meta_organisation'),
'categories': categories,
- 'ext': 'mp4',
+ 'is_live': _v('islive') == 'true',
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py
index 9665ece89..df47e88ba 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/leeco.py
@@ -1,36 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import datetime
+import hashlib
import re
import time
-import base64
-import hashlib
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_ord,
compat_str,
+ compat_urllib_parse,
)
from ..utils import (
determine_ext,
+ encode_data_uri,
ExtractorError,
+ int_or_none,
+ orderedSet,
parse_iso8601,
sanitized_Request,
- int_or_none,
str_or_none,
- encode_data_uri,
url_basename,
)
-class LetvIE(InfoExtractor):
+class LeIE(InfoExtractor):
IE_DESC = '乐视网'
- _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
+ _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html'
+
+ _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
_TESTS = [{
- 'url': 'http://www.letv.com/ptv/vplay/22005890.html',
+ 'url': 'http://www.le.com/ptv/vplay/22005890.html',
'md5': 'edadcfe5406976f42f9f266057ee5e40',
'info_dict': {
'id': '22005890',
@@ -42,7 +45,7 @@ class LetvIE(InfoExtractor):
'hls_prefer_native': True,
},
}, {
- 'url': 'http://www.letv.com/ptv/vplay/1415246.html',
+ 'url': 'http://www.le.com/ptv/vplay/1415246.html',
'info_dict': {
'id': '1415246',
'ext': 'mp4',
@@ -54,7 +57,7 @@ class LetvIE(InfoExtractor):
},
}, {
'note': 'This video is available only in Mainland China, thus a proxy is needed',
- 'url': 'http://www.letv.com/ptv/vplay/1118082.html',
+ 'url': 'http://www.le.com/ptv/vplay/1118082.html',
'md5': '2424c74948a62e5f31988438979c5ad1',
'info_dict': {
'id': '1118082',
@@ -94,17 +97,16 @@ class LetvIE(InfoExtractor):
return encrypted_data
encrypted_data = encrypted_data[5:]
- _loc4_ = bytearray()
- while encrypted_data:
- b = compat_ord(encrypted_data[0])
- _loc4_.extend([b // 16, b & 0x0f])
- encrypted_data = encrypted_data[1:]
+ _loc4_ = bytearray(2 * len(encrypted_data))
+ for idx, val in enumerate(encrypted_data):
+ b = compat_ord(val)
+ _loc4_[2 * idx] = b // 16
+ _loc4_[2 * idx + 1] = b % 16
idx = len(_loc4_) - 11
_loc4_ = _loc4_[idx:] + _loc4_[:idx]
- _loc7_ = bytearray()
- while _loc4_:
- _loc7_.append(_loc4_[0] * 16 + _loc4_[1])
- _loc4_ = _loc4_[2:]
+ _loc7_ = bytearray(len(encrypted_data))
+ for i in range(len(encrypted_data)):
+ _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
return bytes(_loc7_)
@@ -117,10 +119,10 @@ class LetvIE(InfoExtractor):
'splatid': 101,
'format': 1,
'tkey': self.calc_time_key(int(time.time())),
- 'domain': 'www.letv.com'
+ 'domain': 'www.le.com'
}
play_json_req = sanitized_Request(
- 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
+ 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
)
cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
if cn_verification_proxy:
@@ -193,26 +195,51 @@ class LetvIE(InfoExtractor):
}
-class LetvTvIE(InfoExtractor):
- _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html'
+class LePlaylistIE(InfoExtractor):
+ _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)'
+
_TESTS = [{
- 'url': 'http://www.letv.com/tv/46177.html',
+ 'url': 'http://www.le.com/tv/46177.html',
'info_dict': {
'id': '46177',
'title': '美人天下',
'description': 'md5:395666ff41b44080396e59570dbac01c'
},
'playlist_count': 35
+ }, {
+ 'url': 'http://tv.le.com/izt/wuzetian/index.html',
+ 'info_dict': {
+ 'id': 'wuzetian',
+ 'title': '武媚娘传奇',
+ 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+ },
+ # This playlist contains some extra videos other than the drama itself
+ 'playlist_mincount': 96
+ }, {
+ 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+ # This series is moved to http://www.le.com/tv/10005297.html
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.le.com/comic/92063.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+ 'only_matching': True,
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
def _real_extract(self, url):
playlist_id = self._match_id(url)
page = self._download_webpage(url, playlist_id)
- media_urls = list(set(re.findall(
- r'http://www.letv.com/ptv/vplay/\d+.html', page)))
- entries = [self.url_result(media_url, ie='Letv')
- for media_url in media_urls]
+ # Currently old domain names are still used in playlists
+ media_ids = orderedSet(re.findall(
+ r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+ entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+ for media_id in media_ids]
title = self._html_search_meta('keywords', page,
fatal=False).split(',')[0]
@@ -222,31 +249,9 @@ class LetvTvIE(InfoExtractor):
playlist_description=description)
-class LetvPlaylistIE(LetvTvIE):
- _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html'
- _TESTS = [{
- 'url': 'http://tv.letv.com/izt/wuzetian/index.html',
- 'info_dict': {
- 'id': 'wuzetian',
- 'title': '武媚娘传奇',
- 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
- },
- # This playlist contains some extra videos other than the drama itself
- 'playlist_mincount': 96
- }, {
- 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml',
- 'info_dict': {
- 'id': 'lswjzzjc',
- # The title should be "劲舞青春", but I can't find a simple way to
- # determine the playlist title
- 'title': '乐视午间自制剧场',
- 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489'
- },
- 'playlist_mincount': 7
- }]
-
-
class LetvCloudIE(InfoExtractor):
+ # Most of *.letv.com is changed to *.le.com on 2016/01/02
+ # but yuntv.letv.com is kept, so also keep the extractor name
IE_DESC = '乐视云'
_VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
@@ -327,7 +332,7 @@ class LetvCloudIE(InfoExtractor):
formats.append({
'url': url,
'ext': determine_ext(decoded_url),
- 'format_id': int_or_none(play_url.get('vtype')),
+ 'format_id': str_or_none(play_url.get('vtype')),
'format_note': str_or_none(play_url.get('definition')),
'width': int_or_none(play_url.get('vwidth')),
'height': int_or_none(play_url.get('vheight')),
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index f8cbca7b3..a8fd639cc 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor):
_VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://lifenews.ru/news/126342',
- 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+ # single video embedded via video/source
+ 'url': 'http://lifenews.ru/news/98736',
+ 'md5': '77c95eaefaca216e32a76a343ad89d23',
'info_dict': {
- 'id': '126342',
+ 'id': '98736',
'ext': 'mp4',
- 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
- 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
- 'thumbnail': 're:http://.*\.jpg',
- 'upload_date': '20140130',
+ 'title': 'Мужчина нашел дома архив оборонного завода',
+ 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+ 'upload_date': '20120805',
}
}, {
- # video in <iframe>
+ # single video embedded via iframe
'url': 'http://lifenews.ru/news/152125',
'md5': '77d19a6f0886cd76bdbf44b4d971a273',
'info_dict': {
@@ -42,15 +42,33 @@ class LifeNewsIE(InfoExtractor):
'upload_date': '20150402',
}
}, {
+ # two videos embedded via iframe
'url': 'http://lifenews.ru/news/153461',
- 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
'info_dict': {
'id': '153461',
- 'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
'upload_date': '20150505',
- }
+ },
+ 'playlist': [{
+ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ 'info_dict': {
+ 'id': '153461-video1',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ },
+ }, {
+ 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322',
+ 'info_dict': {
+ 'id': '153461-video2',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ },
+ }],
}, {
'url': 'http://lifenews.ru/video/13035',
'only_matching': True,
@@ -65,10 +83,14 @@ class LifeNewsIE(InfoExtractor):
'http://lifenews.ru/%s/%s' % (section, video_id),
video_id, 'Downloading page')
- videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
- iframe_link = self._html_search_regex(
- '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
- if not videos and not iframe_link:
+ video_urls = re.findall(
+ r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
+
+ iframe_links = re.findall(
+ r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']',
+ webpage)
+
+ if not video_urls and not iframe_links:
raise ExtractorError('No media links available for %s' % video_id)
title = remove_end(
@@ -95,31 +117,44 @@ class LifeNewsIE(InfoExtractor):
'upload_date': upload_date,
}
- def make_entry(video_id, media, video_number=None):
+ def make_entry(video_id, video_url, index=None):
cur_info = dict(common_info)
cur_info.update({
- 'id': video_id,
- 'url': media[1],
- 'thumbnail': media[0],
- 'title': title if video_number is None else '%s-video%s' % (title, video_number),
+ 'id': video_id if not index else '%s-video%s' % (video_id, index),
+ 'url': video_url,
+ 'title': title if not index else '%s (Видео %s)' % (title, index),
})
return cur_info
- if iframe_link:
- iframe_link = self._proto_relative_url(iframe_link, 'http:')
- cur_info = dict(common_info)
- cur_info.update({
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': iframe_link,
- })
+ def make_video_entry(video_id, video_url, index=None):
+ video_url = compat_urlparse.urljoin(url, video_url)
+ return make_entry(video_id, video_url, index)
+
+ def make_iframe_entry(video_id, video_url, index=None):
+ video_url = self._proto_relative_url(video_url, 'http:')
+ cur_info = make_entry(video_id, video_url, index)
+ cur_info['_type'] = 'url_transparent'
return cur_info
- if len(videos) == 1:
- return make_entry(video_id, videos[0])
- else:
- return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+ if len(video_urls) == 1 and not iframe_links:
+ return make_video_entry(video_id, video_urls[0])
+
+ if len(iframe_links) == 1 and not video_urls:
+ return make_iframe_entry(video_id, iframe_links[0])
+
+ entries = []
+
+ if video_urls:
+ for num, video_url in enumerate(video_urls, 1):
+ entries.append(make_video_entry(video_id, video_url, num))
+
+ if iframe_links:
+ for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1):
+ entries.append(make_iframe_entry(video_id, iframe_link, num))
+
+ playlist = common_info.copy()
+ playlist.update(self.playlist_result(entries, video_id, title, description))
+ return playlist
class LifeEmbedIE(InfoExtractor):
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 857edfde2..4684994e1 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -47,7 +47,7 @@ class LiveLeakIE(InfoExtractor):
'info_dict': {
'id': '801_1409392012',
'ext': 'mp4',
- 'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+ 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
'uploader': 'bony333',
'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
}
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 38fb3d9e4..eada7c299 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -14,6 +14,7 @@ from ..utils import (
xpath_with_ns,
xpath_text,
orderedSet,
+ update_url_query,
int_or_none,
float_or_none,
parse_iso8601,
@@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor):
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base_ele = find_xpath_attr(
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
- base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/'
+ base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
formats = []
video_nodes = smil.findall(self._xpath_ns('.//video', namespace))
@@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor):
for vn in video_nodes:
tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)
furl = (
- '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src']))
+ update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), {
+ 'v': '3.0.3',
+ 'fp': 'WIN 14,0,0,145',
+ }))
if 'clipBegin' in vn.attrib:
furl += '&ssek=' + vn.attrib['clipBegin']
formats.append({
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py
new file mode 100644
index 000000000..f5d00e61d
--- /dev/null
+++ b/youtube_dl/extractor/makerschannel.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MakersChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
+ 'md5': '624a512c6969236b5967bf9286345ad1',
+ 'info_dict': {
+ 'id': '849',
+ 'ext': 'mp4',
+ 'title': 'Landing a bus on a plane is an epic win',
+ 'uploader': 'ZoomIn',
+ 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+ }
+ }
+
+ def _real_extract(self, url):
+ id_type, url_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, url_id)
+ video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
+
+ def extract_data_val(attr, fatal=False):
+ return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
+ minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'minoto:%s' % minoto_id,
+ 'id': extract_data_val('video-id', True),
+ 'title': extract_data_val('title', True),
+ 'description': extract_data_val('description'),
+ 'thumbnail': extract_data_val('image'),
+ 'uploader': extract_data_val('channel'),
+ }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 425fc9e2a..2338e7f96 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -14,7 +14,7 @@ from ..utils import (
class MDRIE(InfoExtractor):
IE_DESC = 'MDR.DE and KiKA'
- _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P<id>\d+)(?:_.+?)?\.html'
_TESTS = [{
# MDR regularly deletes its videos
@@ -60,6 +60,9 @@ class MDRIE(InfoExtractor):
}, {
'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -68,8 +71,8 @@ class MDRIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
data_url = self._search_regex(
- r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
- webpage, 'data url', group='url')
+ r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1',
+ webpage, 'data url', group='url').replace('\/', '/')
doc = self._download_xml(
compat_urlparse.urljoin(url, data_url), video_id)
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
new file mode 100644
index 000000000..959a10589
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MinotoIE(InfoExtractor):
+ _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ player_id = mobj.group('player_id') or '1'
+ video_id = mobj.group('id')
+ video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+ video_metadata = video_data['video-metadata']
+ formats = []
+ for fmt in video_data['video-files']:
+ fmt_url = fmt.get('url')
+ if not fmt_url:
+ continue
+ container = fmt.get('container')
+ if container == 'hls':
+ formats.extend(self._extract_m3u8_formats(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ else:
+ fmt_profile = fmt.get('profile') or {}
+ f = {
+ 'format_id': fmt_profile.get('name-short'),
+ 'format_note': fmt_profile.get('name'),
+ 'url': fmt_url,
+ 'container': container,
+ 'tbr': int_or_none(fmt.get('bitrate')),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'width': int_or_none(fmt.get('width')),
+ 'height': int_or_none(fmt.get('height')),
+ }
+ codecs = fmt.get('codecs')
+ if codecs:
+ codecs = codecs.split(',')
+ if len(codecs) == 2:
+ f.update({
+ 'vcodec': codecs[0],
+ 'acodec': codecs[1],
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_metadata['title'],
+ 'description': video_metadata.get('description'),
+ 'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 29ca45778..819c1b90b 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor):
'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
'info_dict': {
'id': 'EObHWIEKGjA',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
'upload_date': '20121109',
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index c2b7ed9ab..101497118 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
HEADRequest,
+ parse_count,
str_to_int,
)
@@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):
uploader_id = self._search_regex(
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
- like_count = str_to_int(self._search_regex(
- r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
+ like_count = parse_count(self._search_regex(
+ r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
webpage, 'like count', fatal=False))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index f8226cbb2..e47c80119 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -38,7 +38,7 @@ class MofosexIE(InfoExtractor):
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
- format = "-".join(format)
+ format = '-'.join(format)
age_limit = self._rta_search(webpage)
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 97d5da626..0b4787c1d 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
str_to_int,
unified_strdate,
)
@@ -12,55 +13,62 @@ from ..utils import (
class MotherlessIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
- _TESTS = [
- {
- 'url': 'http://motherless.com/AC3FFE1',
- 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
- 'info_dict': {
- 'id': 'AC3FFE1',
- 'ext': 'mp4',
- 'title': 'Fucked in the ass while playing PS3',
- 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
- 'upload_date': '20100913',
- 'uploader_id': 'famouslyfuckedup',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
- },
- {
- 'url': 'http://motherless.com/532291B',
- 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
- 'info_dict': {
- 'id': '532291B',
- 'ext': 'mp4',
- 'title': 'Amazing girl playing the omegle game, PERFECT!',
- 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'],
- 'upload_date': '20140622',
- 'uploader_id': 'Sulivana7x',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ _TESTS = [{
+ 'url': 'http://motherless.com/AC3FFE1',
+ 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+ 'info_dict': {
+ 'id': 'AC3FFE1',
+ 'ext': 'mp4',
+ 'title': 'Fucked in the ass while playing PS3',
+ 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+ 'upload_date': '20100913',
+ 'uploader_id': 'famouslyfuckedup',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'http://motherless.com/532291B',
+ 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+ 'info_dict': {
+ 'id': '532291B',
+ 'ext': 'mp4',
+ 'title': 'Amazing girl playing the omegle game, PERFECT!',
+ 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
+ 'game', 'hairy'],
+ 'upload_date': '20140622',
+ 'uploader_id': 'Sulivana7x',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
},
- {
- 'url': 'http://motherless.com/g/cosplay/633979F',
- 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
- 'info_dict': {
- 'id': '633979F',
- 'ext': 'mp4',
- 'title': 'Turtlette',
- 'categories': ['superheroine heroine superher'],
- 'upload_date': '20140827',
- 'uploader_id': 'shade0230',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ 'skip': '404',
+ }, {
+ 'url': 'http://motherless.com/g/cosplay/633979F',
+ 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+ 'info_dict': {
+ 'id': '633979F',
+ 'ext': 'mp4',
+ 'title': 'Turtlette',
+ 'categories': ['superheroine heroine superher'],
+ 'upload_date': '20140827',
+ 'uploader_id': 'shade0230',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
}
- ]
+ }, {
+ # no keywords
+ 'url': 'http://motherless.com/8B4BBC1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ if any(p in webpage for p in (
+ '<title>404 - MOTHERLESS.COM<',
+ ">The page you're looking for cannot be found.<")):
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
title = self._html_search_regex(
r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
video_url = self._html_search_regex(
@@ -86,7 +94,7 @@ class MotherlessIE(InfoExtractor):
r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
webpage, 'uploader_id')
- categories = self._html_search_meta('keywords', webpage)
+ categories = self._html_search_meta('keywords', webpage, default=None)
if categories:
categories = [cat.strip() for cat in categories.split(',')]
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index e8bb527b8..ed068365d 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -11,6 +11,7 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ float_or_none,
HEADRequest,
sanitized_Request,
unescapeHTML,
@@ -110,7 +111,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
uri = itemdoc.find('guid').text
video_id = self._id_from_uri(uri)
self.report_extraction(video_id)
- mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+ content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
+ mediagen_url = content_el.attrib['url']
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url:
@@ -165,6 +167,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,
+ 'duration': float_or_none(content_el.attrib.get('duration')),
}
def _get_feed_query(self, uri):
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 4557a2b13..f936b92bb 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -18,8 +18,8 @@ class MySpassIE(InfoExtractor):
'info_dict': {
'id': '11741',
'ext': 'mp4',
- "description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
- "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2",
+ 'description': 'Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+ 'title': 'Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2',
},
}
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 36ab388b2..1e21cf98a 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -19,6 +19,7 @@ from ..utils import (
class MyVideoIE(InfoExtractor):
+ _WORKING = False
_VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
IE_NAME = 'myvideo'
_TEST = {
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index a071378b6..3e2b3e599 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,18 +1,26 @@
from __future__ import unicode_literals
+import functools
+import os.path
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urlparse,
+)
from ..utils import (
- parse_duration,
int_or_none,
+ OnDemandPagedList,
+ parse_duration,
+ remove_start,
xpath_text,
xpath_attr,
)
class NBAIE(InfoExtractor):
- _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
_TESTS = [{
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'md5': '9e7729d3010a9c71506fd1248f74e4f4',
@@ -44,14 +52,101 @@ class NBAIE(InfoExtractor):
'timestamp': 1432134543,
'upload_date': '20150520',
}
+ }, {
+ 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
+ 'info_dict': {
+ 'id': '1455672027478-Doc_Feb16_720',
+ 'ext': 'mp4',
+ 'title': 'Practice: Doc Rivers - 2/16/16',
+ 'description': 'Head Coach Doc Rivers addresses the media following practice.',
+ 'upload_date': '20160217',
+ 'timestamp': 1455672000,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'timberwolves',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ },
+ 'playlist_count': 30,
+ 'params': {
+                # Downloading the whole playlist takes too long
+ 'playlist_items': '1-30',
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'Wigginsmp4',
+ 'ext': 'mp4',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
+ 'upload_date': '20141212',
+ 'timestamp': 1418418600,
+ },
+ 'params': {
+ 'noplaylist': True,
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
+ _PAGE_SIZE = 30
+
+ def _fetch_page(self, team, video_id, page):
+ search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({
+ 'type': 'teamvideo',
+ 'start': page * self._PAGE_SIZE + 1,
+ 'npp': (page + 1) * self._PAGE_SIZE + 1,
+ 'sort': 'recent',
+ 'output': 'json',
+ 'site': team,
+ })
+ results = self._download_json(
+ search_url, video_id, note='Download page %d of playlist data' % page)['results'][0]
+ for item in results:
+ yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
+
+ def _extract_playlist(self, orig_path, video_id, webpage):
+ team = orig_path.split('/')[0]
+
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading just video because of --no-playlist')
+ video_path = self._search_regex(
+ r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
+ video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
+ return self.url_result(video_url)
+
+ self.to_screen('Downloading playlist - add --no-playlist to just download video')
+ playlist_title = self._og_search_title(webpage, fatal=False)
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, team, video_id),
+ self._PAGE_SIZE, use_cache=True)
+
+ return self.playlist_result(entries, team, playlist_title)
+
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
+ orig_path = path
if path.startswith('nba/'):
path = path[3:]
+
+ if 'video/' not in path:
+ webpage = self._download_webpage(url, video_id)
+ path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+
+ if path == '{{id}}':
+ return self._extract_playlist(orig_path, video_id, webpage)
+
+ # See prepareContentId() of pkgCvp.js
+ if path.startswith('video/teams'):
+ path = 'video/channels/proxy/' + path[6:]
+
video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id)
- video_id = xpath_text(video_info, 'slug')
+ video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0]
title = xpath_text(video_info, 'headline')
description = xpath_text(video_info, 'description')
duration = parse_duration(xpath_text(video_info, 'length'))
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 18d01f423..2202cfa33 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -57,7 +57,7 @@ class NBCIE(InfoExtractor):
{
# This video has expired but with an escaped embedURL
'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
- 'skip': 'Expired'
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/nerdcubed.py b/youtube_dl/extractor/nerdcubed.py
index dff78e486..9feccc672 100644
--- a/youtube_dl/extractor/nerdcubed.py
+++ b/youtube_dl/extractor/nerdcubed.py
@@ -18,14 +18,14 @@ class NerdCubedFeedIE(InfoExtractor):
}
def _real_extract(self, url):
- feed = self._download_json(url, url, "Downloading NerdCubed JSON feed")
+ feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed')
entries = [{
'_type': 'url',
'title': feed_entry['title'],
'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
- 'url': "http://www.youtube.com/watch?v=" + feed_entry['youtube_id'],
+ 'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'],
} for feed_entry in feed]
return {
diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py
new file mode 100644
index 000000000..4e60b13a5
--- /dev/null
+++ b/youtube_dl/extractor/noz.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+ int_or_none,
+ xpath_text,
+ update_url_query,
+)
+
+
+class NozIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?noz\.de/video/(?P<id>[0-9]+)/'
+ _TESTS = [{
+ 'url': 'http://www.noz.de/video/25151/32-Deutschland-gewinnt-Badminton-Lnderspiel-in-Melle',
+ 'info_dict': {
+ 'id': '25151',
+ 'ext': 'mp4',
+ 'duration': 215,
+ 'title': '3:2 - Deutschland gewinnt Badminton-Länderspiel in Melle',
+ 'description': 'Vor rund 370 Zuschauern gewinnt die deutsche Badminton-Nationalmannschaft am Donnerstag ein EM-Vorbereitungsspiel gegen Frankreich in Melle. Video Moritz Frankenberg.',
+ 'thumbnail': 're:^http://.*\.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ description = self._og_search_description(webpage)
+
+ edge_url = self._html_search_regex(
+ r'<script\s+(?:type="text/javascript"\s+)?src="(.*?/videojs_.*?)"',
+ webpage, 'edge URL')
+ edge_content = self._download_webpage(edge_url, 'meta configuration')
+
+ config_url_encoded = self._search_regex(
+ r'so\.addVariable\("config_url","[^,]*,(.*?)"',
+ edge_content, 'config URL'
+ )
+ config_url = compat_urllib_parse_unquote(config_url_encoded)
+
+ doc = self._download_xml(config_url, 'video configuration')
+ title = xpath_text(doc, './/title')
+ thumbnail = xpath_text(doc, './/article/thumbnail/url')
+ duration = int_or_none(xpath_text(
+ doc, './/article/movie/file/duration'))
+ formats = []
+ for qnode in doc.findall('.//article/movie/file/qualities/qual'):
+ http_url = xpath_text(
+ qnode, './html_urls/video_url[@format="video/mp4"]')
+ if http_url:
+ formats.append({
+ 'url': http_url,
+ 'format_name': xpath_text(qnode, './name'),
+ 'format_id': '%s-%s' % ('http', xpath_text(qnode, './id')),
+ 'height': int_or_none(xpath_text(qnode, './height')),
+ 'width': int_or_none(xpath_text(qnode, './width')),
+ 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000),
+ })
+ else:
+ f4m_url = xpath_text(qnode, 'url_hd2')
+ if f4m_url:
+ formats.extend(self._extract_f4m_formats(
+ update_url_query(f4m_url, {'hdcore': '3.4.0'}),
+ video_id, f4m_id='hds', fatal=False))
+ m3u8_url = xpath_text(
+ qnode, './html_urls/video_url[@format="application/vnd.apple.mpegurl"]')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'duration': duration,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index a126f5054..3b21fbd4d 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_urllib_parse_unquote,
+)
from ..utils import (
determine_ext,
ExtractorError,
@@ -87,7 +90,7 @@ class NRKIE(InfoExtractor):
class NRKPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -126,6 +129,37 @@ class NRKPlaylistIE(InfoExtractor):
entries, playlist_id, playlist_title, playlist_description)
+class NRKSkoleIE(InfoExtractor):
+ IE_DESC = 'NRK Skole'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532',
+ 'md5': '04cd85877cc1913bce73c5d28a47e00f',
+ 'info_dict': {
+ 'id': '6021',
+ 'ext': 'flv',
+ 'title': 'Genetikk og eneggede tvillinger',
+ 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
+ 'duration': 399,
+ },
+ }, {
+ 'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = compat_urllib_parse_unquote(self._match_id(url))
+
+ webpage = self._download_webpage(url, video_id)
+
+ nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
+ return self.url_result('nrk:%s' % nrk_id)
+
+
class NRKTVIE(InfoExtractor):
IE_DESC = 'NRK TV and NRK Radio'
_VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index c54775d54..958eb398b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -112,6 +112,7 @@ class ORFTVthekIE(InfoExtractor):
% geo_str),
fatal=False)
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
upload_date = unified_strdate(sd['created_date'])
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index cbd1efea0..f43e3a146 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
determine_ext,
@@ -200,7 +201,7 @@ class PBSIE(InfoExtractor):
'id': '2365006249',
'ext': 'mp4',
'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
- 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
+ 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
'duration': 3190,
},
'params': {
@@ -214,7 +215,7 @@ class PBSIE(InfoExtractor):
'id': '2365297690',
'ext': 'mp4',
'title': 'FRONTLINE - Losing Iraq',
- 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+ 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
'duration': 5050,
},
'params': {
@@ -228,7 +229,7 @@ class PBSIE(InfoExtractor):
'id': '2201174722',
'ext': 'mp4',
'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
- 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
+ 'description': 'md5:95a19f568689d09a166dff9edada3301',
'duration': 801,
},
},
@@ -238,8 +239,8 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2365297708',
'ext': 'mp4',
- 'description': 'md5:68d87ef760660eb564455eb30ca464fe',
'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+ 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b',
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -279,7 +280,7 @@ class PBSIE(InfoExtractor):
'display_id': 'player',
'ext': 'mp4',
'title': 'American Experience - Death and the Civil War, Chapter 1',
- 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+ 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d',
'duration': 682,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -288,20 +289,19 @@ class PBSIE(InfoExtractor):
},
},
{
- 'url': 'http://video.pbs.org/video/2365367186/',
+ 'url': 'http://www.pbs.org/video/2365245528/',
'info_dict': {
- 'id': '2365367186',
- 'display_id': '2365367186',
+ 'id': '2365245528',
+ 'display_id': '2365245528',
'ext': 'mp4',
- 'title': 'To Catch A Comet - Full Episode',
- 'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.',
- 'duration': 3342,
+ 'title': 'FRONTLINE - United States of Secrets (Part One)',
+ 'description': 'md5:55756bd5c551519cc4b7703e373e217e',
+ 'duration': 6851,
'thumbnail': 're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True, # requires ffmpeg
},
- 'skip': 'Expired',
},
{
# Video embedded in iframe containing angle brackets as attribute's value (e.g.
@@ -313,7 +313,7 @@ class PBSIE(InfoExtractor):
'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
'ext': 'mp4',
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
- 'description': 'md5:61db2ddf27c9912f09c241014b118ed1',
+ 'description': 'md5:54033c6baa1f9623607c6e2ed245888b',
'duration': 1480,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -329,7 +329,7 @@ class PBSIE(InfoExtractor):
'display_id': 'the-atomic-artists',
'ext': 'mp4',
'title': 'FRONTLINE - The Atomic Artists',
- 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+ 'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1',
'duration': 723,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -338,6 +338,21 @@ class PBSIE(InfoExtractor):
},
},
{
+            # Serves HD only via widget/partnerplayer page
+ 'url': 'http://www.pbs.org/video/2365641075/',
+ 'info_dict': {
+ 'id': '2365641075',
+ 'ext': 'mp4',
+ 'title': 'FRONTLINE - Netanyahu at War',
+ 'duration': 6852,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'formats': 'mincount:8',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
},
@@ -366,10 +381,14 @@ class PBSIE(InfoExtractor):
webpage, 'upload date', default=None))
# tabbed frontline videos
- tabbed_videos = re.findall(
- r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage)
- if tabbed_videos:
- return tabbed_videos, presumptive_id, upload_date
+ MULTI_PART_REGEXES = (
+ r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
+ r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ )
+ for p in MULTI_PART_REGEXES:
+ tabbed_videos = re.findall(p, webpage)
+ if tabbed_videos:
+ return tabbed_videos, presumptive_id, upload_date
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
@@ -433,33 +452,54 @@ class PBSIE(InfoExtractor):
for vid_id in video_id]
return self.playlist_result(entries, display_id)
- player = self._download_webpage(
- 'http://player.pbs.org/portalplayer/%s' % video_id, display_id)
-
- info = self._parse_json(
- self._search_regex(
- r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
- player, 'video data', default='{}'),
- display_id, transform_source=js_to_json, fatal=False)
-
- # Fallback to old videoInfo API
- if not info:
- info = self._download_json(
+ info = None
+ redirects = []
+ redirect_urls = set()
+
+ def extract_redirect_urls(info):
+ for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+ redirect = info.get(encoding_name)
+ if not redirect:
+ continue
+ redirect_url = redirect.get('url')
+ if redirect_url and redirect_url not in redirect_urls:
+ redirects.append(redirect)
+ redirect_urls.add(redirect_url)
+
+ try:
+ video_info = self._download_json(
'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
display_id, 'Downloading video info JSON')
+ extract_redirect_urls(video_info)
+ info = video_info
+ except ExtractorError as e:
+ # videoInfo API may not work for some videos
+ if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404:
+ raise
+
+ # Player pages may also serve different qualities
+ for page in ('widget/partnerplayer', 'portalplayer'):
+ player = self._download_webpage(
+ 'http://player.pbs.org/%s/%s' % (page, video_id),
+ display_id, 'Downloading %s page' % page, fatal=False)
+ if player:
+ video_info = self._parse_json(
+ self._search_regex(
+ r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+ player, '%s video data' % page, default='{}'),
+ display_id, transform_source=js_to_json, fatal=False)
+ if video_info:
+ extract_redirect_urls(video_info)
+ if not info:
+ info = video_info
formats = []
- for encoding_name in ('recommended_encoding', 'alternate_encoding'):
- redirect = info.get(encoding_name)
- if not redirect:
- continue
- redirect_url = redirect.get('url')
- if not redirect_url:
- continue
+ for num, redirect in enumerate(redirects):
+ redirect_id = redirect.get('eeid')
redirect_info = self._download_json(
- redirect_url + '?format=json', display_id,
- 'Downloading %s video url info' % encoding_name)
+ '%s?format=json' % redirect['url'], display_id,
+ 'Downloading %s video url info' % (redirect_id or num))
if redirect_info['status'] == 'error':
raise ExtractorError(
@@ -478,8 +518,9 @@ class PBSIE(InfoExtractor):
else:
formats.append({
'url': format_url,
- 'format_id': redirect.get('eeid'),
+ 'format_id': redirect_id,
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
rating_str = info.get('rating')
@@ -505,7 +546,7 @@ class PBSIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': info['title'],
- 'description': info['program'].get('description'),
+ 'description': info.get('description') or info.get('program', {}).get('description'),
'thumbnail': info.get('image_url'),
'duration': int_or_none(info.get('duration')),
'age_limit': age_limit,
diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py
index 2aba7cb9c..c3c38cf4a 100644
--- a/youtube_dl/extractor/plays.py
+++ b/youtube_dl/extractor/plays.py
@@ -26,8 +26,9 @@ class PlaysTVIE(InfoExtractor):
title = self._og_search_title(webpage)
content = self._parse_json(
- self._search_regex(r'R\.bindContent\(({.+?})\);', webpage,
- 'content'), video_id)['content']
+ self._search_regex(
+ r'R\.bindContent\(({.+?})\);', webpage,
+ 'content'), video_id)['content']
mpd_url, sources = re.search(
r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>',
content).groups()
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 08275687d..5a55c25e7 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -11,6 +11,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ int_or_none,
sanitized_Request,
str_to_int,
)
@@ -23,13 +24,18 @@ class PornHubIE(InfoExtractor):
_VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
- 'md5': '882f488fa1f0026f023f33576004a2ed',
+ 'md5': '1e19b41231a02eba417839222ac9d58e',
'info_dict': {
'id': '648719015',
'ext': 'mp4',
- "uploader": "Babes",
- "title": "Seductive Indian beauty strips down and fingers her pink pussy",
- "age_limit": 18
+ 'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
+ 'uploader': 'Babes',
+ 'duration': 361,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
}
}, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
@@ -67,13 +73,23 @@ class PornHubIE(InfoExtractor):
'PornHub said: %s' % error_msg,
expected=True, video_id=video_id)
- video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
+ flashvars = self._parse_json(
+ self._search_regex(
+            r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
+ video_id)
+ if flashvars:
+ video_title = flashvars.get('video_title')
+ thumbnail = flashvars.get('image_url')
+ duration = int_or_none(flashvars.get('video_duration'))
+ else:
+ video_title, thumbnail, duration = [None] * 3
+
+ if not video_title:
+ video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
+
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
- thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
- if thumbnail:
- thumbnail = compat_urllib_parse_unquote(thumbnail)
view_count = self._extract_count(
r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
@@ -95,7 +111,7 @@ class PornHubIE(InfoExtractor):
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
- format = "-".join(format)
+ format = '-'.join(format)
m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
if m is None:
@@ -120,6 +136,7 @@ class PornHubIE(InfoExtractor):
'uploader': video_uploader,
'title': video_title,
'thumbnail': thumbnail,
+ 'duration': duration,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
@@ -129,27 +146,20 @@ class PornHubIE(InfoExtractor):
}
-class PornHubPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
- _TESTS = [{
- 'url': 'http://www.pornhub.com/playlist/6201671',
- 'info_dict': {
- 'id': '6201671',
- 'title': 'P0p4',
- },
- 'playlist_mincount': 35,
- }]
+class PornHubPlaylistBaseIE(InfoExtractor):
+ def _extract_entries(self, webpage):
+ return [
+ self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key())
+ for video_url in set(re.findall(
+ r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage))
+ ]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- entries = [
- self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
- for video_url in set(re.findall(
- r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage))
- ]
+ entries = self._extract_entries(webpage)
playlist = self._parse_json(
self._search_regex(
@@ -158,3 +168,33 @@ class PornHubPlaylistIE(InfoExtractor):
return self.playlist_result(
entries, playlist_id, playlist.get('title'), playlist.get('description'))
+
+
+class PornHubPlaylistIE(PornHubPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.pornhub.com/playlist/6201671',
+ 'info_dict': {
+ 'id': '6201671',
+ 'title': 'P0p4',
+ },
+ 'playlist_mincount': 35,
+ }]
+
+
+class PornHubUserVideosIE(PornHubPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
+ _TESTS = [{
+ 'url': 'http://www.pornhub.com/users/rushandlia/videos',
+ 'info_dict': {
+ 'id': 'rushandlia',
+ },
+ 'playlist_mincount': 13,
+ }]
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, user_id)
+
+ return self.playlist_result(self._extract_entries(webpage), user_id)
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
index eba4dfbb3..1a53fd71c 100644
--- a/youtube_dl/extractor/pornovoisines.py
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -56,7 +56,7 @@ class PornoVoisinesIE(InfoExtractor):
r'<h1>(.+?)</h1>', webpage, 'title', flags=re.DOTALL)
description = self._html_search_regex(
r'<article id="descriptif">(.+?)</article>',
- webpage, "description", fatal=False, flags=re.DOTALL)
+ webpage, 'description', fatal=False, flags=re.DOTALL)
thumbnail = self._search_regex(
r'<div id="mediaspace%s">\s*<img src="/?([^"]+)"' % video_id,
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 6d5732d45..30a5f2de4 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -12,14 +12,14 @@ class PyvideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
- 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+ 'md5': '520915673e53a5c5d487c36e0c4d85b5',
'info_dict': {
'id': '24_4WWkSmNo',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Become a logging expert in 30 minutes',
'description': 'md5:9665350d466c67fb5b1598de379021f7',
'upload_date': '20130320',
- 'uploader': 'NextDayVideo',
+ 'uploader': 'Next Day Video',
'uploader_id': 'NextDayVideo',
},
'add_ie': ['Youtube'],
diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py
index 0d706312e..0cbb15f08 100644
--- a/youtube_dl/extractor/radiobremen.py
+++ b/youtube_dl/extractor/radiobremen.py
@@ -28,16 +28,16 @@ class RadioBremenIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id
+ meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id
meta_doc = self._download_webpage(
meta_url, video_id, 'Downloading metadata')
title = self._html_search_regex(
- r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title")
+ r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title')
description = self._html_search_regex(
- r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False)
+ r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False)
duration = parse_duration(self._html_search_regex(
- r"L&auml;nge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>",
- meta_doc, "duration", fatal=False))
+ r'L&auml;nge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>',
+ meta_doc, 'duration', fatal=False))
page_doc = self._download_webpage(
url, video_id, 'Downloading video information')
@@ -51,7 +51,7 @@ class RadioBremenIE(InfoExtractor):
formats = [{
'url': video_url,
'ext': 'mp4',
- 'width': int(mobj.group("width")),
+ 'width': int(mobj.group('width')),
}]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
index 09352ed82..a8afc0014 100644
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@@ -16,9 +16,9 @@ class RadioFranceIE(InfoExtractor):
'info_dict': {
'id': 'one-one',
'ext': 'ogg',
- "title": "One to one",
- "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
- "uploader": "Thomas Hercouët",
+ 'title': 'One to one',
+ 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+ 'uploader': 'Thomas Hercouët',
},
}
diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py
index 0f8f3ebde..7932af6ef 100644
--- a/youtube_dl/extractor/rbmaradio.py
+++ b/youtube_dl/extractor/rbmaradio.py
@@ -18,11 +18,11 @@ class RBMARadioIE(InfoExtractor):
'info_dict': {
'id': 'ford-lopatin-live-at-primavera-sound-2011',
'ext': 'mp3',
- "uploader_id": "ford-lopatin",
- "location": "Spain",
- "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.",
- "uploader": "Ford & Lopatin",
- "title": "Live at Primavera Sound 2011",
+ 'uploader_id': 'ford-lopatin',
+ 'location': 'Spain',
+ 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.',
+ 'uploader': 'Ford & Lopatin',
+ 'title': 'Live at Primavera Sound 2011',
},
}
diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py
index ec7e7df7b..3c6725aeb 100644
--- a/youtube_dl/extractor/reverbnation.py
+++ b/youtube_dl/extractor/reverbnation.py
@@ -12,12 +12,12 @@ class ReverbNationIE(InfoExtractor):
'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa',
'md5': '3da12ebca28c67c111a7f8b262d3f7a7',
'info_dict': {
- "id": "16965047",
- "ext": "mp3",
- "title": "MONA LISA",
- "uploader": "ALKILADOS",
- "uploader_id": "216429",
- "thumbnail": "re:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$"
+ 'id': '16965047',
+ 'ext': 'mp3',
+ 'title': 'MONA LISA',
+ 'uploader': 'ALKILADOS',
+ 'uploader_id': '216429',
+ 'thumbnail': 're:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$'
},
}]
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
index b1b8800b9..99979ebe1 100644
--- a/youtube_dl/extractor/revision3.py
+++ b/youtube_dl/extractor/revision3.py
@@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor):
'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
'md5': 'd94a72d85d0a829766de4deb8daaf7df',
'info_dict': {
- 'id': '73034',
+ 'id': '71089',
'display_id': 'technobuffalo/5-google-predictions-for-2016',
'ext': 'webm',
'title': '5 Google Predictions for 2016',
@@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor):
'uploader_id': 'technobuffalo',
}
}, {
+ # Show
'url': 'http://testtube.com/brainstuff',
'info_dict': {
'id': '251',
@@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor):
}, {
'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
'info_dict': {
- 'id': '60163',
+ 'id': '58227',
'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
'duration': 275,
'ext': 'webm',
@@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor):
'uploader': 'DNews',
'uploader_id': 'dnews',
},
+ }, {
+ 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+ 'info_dict': {
+ 'id': '71618',
+ 'ext': 'mp4',
+ 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+ 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes',
+ 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start',
+ 'uploader': 'Editors\' Picks',
+ 'uploader_id': 'tt-editors-picks',
+ 'timestamp': 1453309200,
+ 'upload_date': '20160120',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ # Tag
+ 'url': 'http://testtube.com/tech-news',
+ 'info_dict': {
+ 'id': '21018',
+ 'title': 'tech news',
+ },
+ 'playlist_mincount': 9,
}]
_PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
_API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
def _real_extract(self, url):
domain, display_id = re.match(self._VALID_URL, url).groups()
+ site = domain.split('.')[0]
page_info = self._download_json(
self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
- if page_info['data']['type'] == 'episode':
- episode_data = page_info['data']
- video_id = compat_str(episode_data['video']['data']['id'])
+ page_data = page_info['data']
+ page_type = page_data['type']
+ if page_type in ('episode', 'embed'):
+ show_data = page_data['show']['data']
+ page_id = compat_str(page_data['id'])
+ video_id = compat_str(page_data['video']['data']['id'])
+
+ preference = qualities(['mini', 'small', 'medium', 'large'])
+ thumbnails = [{
+ 'url': image_url,
+ 'id': image_id,
+ 'preference': preference(image_id)
+ } for image_id, image_url in page_data.get('images', {}).items()]
+
+ info = {
+ 'id': page_id,
+ 'display_id': display_id,
+ 'title': unescapeHTML(page_data['name']),
+ 'description': unescapeHTML(page_data.get('summary')),
+ 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '),
+ 'author': page_data.get('author'),
+ 'uploader': show_data.get('name'),
+ 'uploader_id': show_data.get('slug'),
+ 'thumbnails': thumbnails,
+ 'extractor_key': site,
+ }
+
+ if page_type == 'embed':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': page_data['video']['data']['embed'],
+ })
+ return info
+
video_data = self._download_json(
'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
video_id)['items'][0]
@@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor):
})
self._sort_formats(formats)
- preference = qualities(['mini', 'small', 'medium', 'large'])
- thumbnails = [{
- 'url': image_url,
- 'id': image_id,
- 'preference': preference(image_id)
- } for image_id, image_url in video_data.get('images', {}).items()]
-
- return {
- 'id': video_id,
- 'display_id': display_id,
+ info.update({
'title': unescapeHTML(video_data['title']),
'description': unescapeHTML(video_data.get('summary')),
- 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '),
- 'author': episode_data.get('author'),
'uploader': video_data.get('show', {}).get('name'),
'uploader_id': video_data.get('show', {}).get('slug'),
'duration': int_or_none(video_data.get('duration')),
- 'thumbnails': thumbnails,
'formats': formats,
- }
+ })
+ return info
else:
- show_data = page_info['show']['data']
+ list_data = page_info[page_type]['data']
episodes_data = page_info['episodes']['data']
num_episodes = page_info['meta']['totalEpisodes']
processed_episodes = 0
entries = []
page_num = 1
while True:
- entries.extend([self.url_result(
- 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data])
+ entries.extend([{
+ '_type': 'url',
+ 'url': 'http://%s%s' % (domain, episode['path']),
+ 'id': compat_str(episode['id']),
+ 'ie_key': 'Revision3',
+ 'extractor_key': site,
+ } for episode in episodes_data])
processed_episodes += len(episodes_data)
if processed_episodes == num_episodes:
break
@@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor):
display_id)['episodes']['data']
return self.playlist_result(
- entries, compat_str(show_data['id']),
- show_data.get('name'), show_data.get('summary'))
+ entries, compat_str(list_data['id']),
+ list_data.get('name'), list_data.get('summary'))
diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py
new file mode 100644
index 000000000..f855719ac
--- /dev/null
+++ b/youtube_dl/extractor/rice.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+ xpath_text,
+ xpath_element,
+ int_or_none,
+ parse_iso8601,
+ ExtractorError,
+)
+
+
+class RICEIE(InfoExtractor):
+ _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)'
+ _TEST = {
+ 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw',
+ 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a',
+ 'info_dict': {
+ 'id': 'YEWIvbhb40aqdjMD1ALSqw',
+ 'ext': 'mp4',
+ 'title': 'Active Learning in Archeology',
+ 'upload_date': '20140616',
+ 'timestamp': 1402926346,
+ }
+ }
+ _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config'
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ portal_id = qs['PortalID'][0]
+ playlist_id = qs['DestinationID'][0]
+ content_id = qs['ContentID'][0]
+
+ content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={
+ 'portalId': portal_id,
+ 'playlistId': playlist_id,
+ 'contentId': content_id
+ })
+ metadata = xpath_element(content_data, './/metaData', fatal=True)
+ title = xpath_text(metadata, 'primaryTitle', fatal=True)
+ encodings = xpath_element(content_data, './/encodings', fatal=True)
+ player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={
+ 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True),
+ 'contentId': content_id,
+ })
+
+ common_fmt = {}
+ dimensions = xpath_text(encodings, 'dimensions')
+ if dimensions:
+ wh = dimensions.split('x')
+ if len(wh) == 2:
+ common_fmt.update({
+ 'width': int_or_none(wh[0]),
+ 'height': int_or_none(wh[1]),
+ })
+
+ formats = []
+ rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS))
+ if rtsp_path:
+ fmt = {
+ 'url': rtsp_path,
+ 'format_id': 'rtsp',
+ }
+ fmt.update(common_fmt)
+ formats.append(fmt)
+ for source in player_data.findall(self._xpath_ns('.//Source', self._NS)):
+ video_url = xpath_text(source, self._xpath_ns('File', self._NS))
+ if not video_url:
+ continue
+ if '.m3u8' in video_url:
+ formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ fmt = {
+ 'url': video_url,
+ 'format_id': video_url.split(':')[0],
+ }
+ fmt.update(common_fmt)
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for content_asset in content_data.findall('.//contentAssets'):
+ asset_type = xpath_text(content_asset, 'type')
+ if asset_type == 'image':
+ image_url = xpath_text(content_asset, 'httpPath')
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'id': xpath_text(content_asset, 'ID'),
+ 'url': image_url,
+ })
+
+ return {
+ 'id': content_id,
+ 'title': title,
+ 'description': xpath_text(metadata, 'abstract'),
+ 'duration': int_or_none(xpath_text(metadata, 'duration')),
+ 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py
index efa4afeb6..508758075 100644
--- a/youtube_dl/extractor/ringtv.py
+++ b/youtube_dl/extractor/ringtv.py
@@ -8,13 +8,13 @@ from .common import InfoExtractor
class RingTVIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)'
_TEST = {
- "url": "http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30",
- "md5": "d25945f5df41cdca2d2587165ac28720",
- "info_dict": {
+ 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30',
+ 'md5': 'd25945f5df41cdca2d2587165ac28720',
+ 'info_dict': {
'id': '857645',
'ext': 'mp4',
- "title": 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV',
- "description": 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.',
+ 'title': 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV',
+ 'description': 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.',
}
}
@@ -32,8 +32,8 @@ class RingTVIE(InfoExtractor):
description = self._html_search_regex(
r'addthis:description="([^"]+)"',
webpage, 'description', fatal=False)
- final_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4" % video_id
- thumbnail_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg" % video_id
+ final_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4' % video_id
+ thumbnail_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg' % video_id
return {
'id': video_id,
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index 73c9788be..042bc8dab 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -43,7 +43,7 @@ class RteIE(InfoExtractor):
r'<meta name="thumbnail" content="uri:irus:(.*?)" />', webpage, 'thumbnail')
thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg'
- feeds_url = self._html_search_meta("feeds-prefix", webpage, 'feeds url') + video_id
+ feeds_url = self._html_search_meta('feeds-prefix', webpage, 'feeds url') + video_id
json_string = self._download_json(feeds_url, video_id)
# f4m_url = server + relative_url
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
index 25f7faf76..de004671d 100644
--- a/youtube_dl/extractor/rtl2.py
+++ b/youtube_dl/extractor/rtl2.py
@@ -63,7 +63,7 @@ class RTL2IE(InfoExtractor):
download_url = video_info['streamurl']
download_url = download_url.replace('\\', '')
stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL')
- rtmp_conn = ["S:connect", "O:1", "NS:pageUrl:" + url, "NB:fpad:0", "NN:videoFunction:1", "O:0"]
+ rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0']
formats = [{
'url': download_url,
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 603d7bd00..8a8c5d2a0 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -10,6 +10,7 @@ from ..utils import (
ExtractorError,
float_or_none,
remove_end,
+ remove_start,
sanitized_Request,
std_headers,
struct_unpack,
@@ -178,14 +179,14 @@ class RTVEInfantilIE(InfoExtractor):
class RTVELiveIE(InfoExtractor):
IE_NAME = 'rtve.es:live'
IE_DESC = 'RTVE.es live streams'
- _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+ _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
_TESTS = [{
- 'url': 'http://www.rtve.es/noticias/directo-la-1/',
+ 'url': 'http://www.rtve.es/directo/la-1/',
'info_dict': {
- 'id': 'directo-la-1',
- 'ext': 'flv',
- 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+ 'id': 'la-1',
+ 'ext': 'mp4',
+ 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
},
'params': {
'skip_download': 'live stream',
@@ -198,23 +199,20 @@ class RTVELiveIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- player_url = self._search_regex(
- r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL')
- title = remove_end(self._og_search_title(webpage), ' en directo')
+ title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+ title = remove_start(title, 'Estoy viendo ')
title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
vidplayer_id = self._search_regex(
- r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+ r'playerId=player([0-9]+)', webpage, 'internal video ID')
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
png = self._download_webpage(png_url, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
+ m3u8_url = _decrypt_url(png)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
return {
'id': video_id,
- 'ext': 'flv',
'title': title,
- 'url': video_url,
- 'app': 'rtve-live-live?ovpfv=2.1.2',
- 'player_url': player_url,
- 'rtmp_live': True,
+ 'formats': formats,
+ 'is_live': True,
}
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 7de7b7273..256396bb8 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -4,14 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
from ..utils import (
ExtractorError,
sanitized_Request,
- smuggle_url,
std_headers,
urlencode_postdata,
+ update_url_query,
)
@@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor):
_SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
_NETRC_MACHINE = 'safari'
- _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+ _API_BASE = 'https://www.safaribooksonline.com/api/v1'
_API_FORMAT = 'json'
LOGGED_IN = False
def _real_initialize(self):
- # We only need to log in once for courses or individual videos
- if not self.LOGGED_IN:
- self._login()
- SafariBaseIE.LOGGED_IN = True
+ self._login()
def _login(self):
+ # We only need to log in once for courses or individual videos
+ if self.LOGGED_IN:
+ return
+
(username, password) = self._get_login_info()
if username is None:
- self.raise_login_required('safaribooksonline.com account is required')
+ return
- headers = std_headers
+ headers = std_headers.copy()
if 'Referer' not in headers:
headers['Referer'] = self._LOGIN_URL
+ login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)
login_page = self._download_webpage(
- self._LOGIN_URL, None,
+ login_page_request, None,
'Downloading login form')
csrf = self._html_search_regex(
@@ -66,6 +67,8 @@ class SafariBaseIE(InfoExtractor):
'Login failed; make sure your credentials are correct and try again.',
expected=True)
+ SafariBaseIE.LOGGED_IN = True
+
self.to_screen('Login successful')
@@ -85,13 +88,15 @@ class SafariIE(SafariBaseIE):
_TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
- 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+ 'md5': 'dcc5a425e79f2564148652616af1f2a3',
'info_dict': {
- 'id': '2842601850001',
+ 'id': '0_qbqx90ic',
'ext': 'mp4',
- 'title': 'Introduction',
+ 'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+ 'timestamp': 1437758058,
+ 'upload_date': '20150724',
+ 'uploader_id': 'stork',
},
- 'skip': 'Requires safaribooksonline account credentials',
}, {
'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
'only_matching': True,
@@ -106,15 +111,30 @@ class SafariIE(SafariBaseIE):
course_id = mobj.group('course_id')
part = mobj.group('part')
- webpage = self._download_webpage(
- '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
- part)
+ webpage = self._download_webpage(url, '%s/%s' % (course_id, part))
+ reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id')
+ partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id')
+ ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id')
+
+ query = {
+ 'wid': '_%s' % partner_id,
+ 'uiconf_id': ui_id,
+ 'flashvars[referenceId]': reference_id,
+ }
- bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if not bc_url:
- raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+ if self.LOGGED_IN:
+ kaltura_session = self._download_json(
+ '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+ course_id, 'Downloading kaltura session JSON',
+ 'Unable to download kaltura session JSON', fatal=False)
+ if kaltura_session:
+ session = kaltura_session.get('session')
+ if session:
+ query['flashvars[ks]'] = session
- return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+ return self.url_result(update_url_query(
+ 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+ 'Kaltura')
class SafariCourseIE(SafariBaseIE):
@@ -140,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):
course_id = self._match_id(url)
course_json = self._download_json(
- '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+ '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
course_id, 'Downloading course JSON')
if 'chapters' not in course_json:
diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py
new file mode 100644
index 000000000..f2af15f6b
--- /dev/null
+++ b/youtube_dl/extractor/screenjunkies.py
@@ -0,0 +1,138 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ parse_age_limit,
+)
+
+
+class ScreenJunkiesIE(InfoExtractor):
+    _VALID_URL = r'http://www\.screenjunkies\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
+ _TESTS = [{
+ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915',
+ 'md5': '5c2b686bec3d43de42bde9ec047536b0',
+ 'info_dict': {
+ 'id': '2841915',
+ 'display_id': 'best-quentin-tarantino-movie',
+ 'ext': 'mp4',
+ 'title': 'Best Quentin Tarantino Movie',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3671,
+ 'age_limit': 13,
+ 'tags': list,
+ },
+ }, {
+ 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight',
+ 'info_dict': {
+ 'id': '2348808',
+ 'display_id': 'honest-trailers-the-dark-knight',
+ 'ext': 'mp4',
+ 'title': "Honest Trailers: 'The Dark Knight'",
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'age_limit': 10,
+ 'tags': list,
+ },
+ }, {
+ # requires subscription but worked around
+ 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285',
+ 'info_dict': {
+ 'id': '3003285',
+ 'display_id': 'knocking-dead-ep-1-the-show-so-far',
+ 'ext': 'mp4',
+ 'title': 'Knocking Dead Ep 1: State of The Dead Recap',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3307,
+ 'age_limit': 13,
+ 'tags': list,
+ },
+ }]
+
+ _DEFAULT_BITRATES = (48, 150, 496, 864, 2240)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ if not video_id:
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(
+ (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'),
+ webpage, 'video id')
+
+ webpage = self._download_webpage(
+ 'http://www.screenjunkies.com/embed/%s' % video_id,
+ display_id, 'Downloading video embed page')
+ embed_vars = self._parse_json(
+ self._search_regex(
+ r'(?s)embedVars\s*=\s*({.+?})\s*</script>', webpage, 'embed vars'),
+ display_id)
+
+ title = embed_vars['contentName']
+
+ formats = []
+ bitrates = []
+ for f in embed_vars.get('media', []):
+ if not f.get('uri') or f.get('mediaPurpose') != 'play':
+ continue
+ bitrate = int_or_none(f.get('bitRate'))
+ if bitrate:
+ bitrates.append(bitrate)
+ formats.append({
+ 'url': f['uri'],
+ 'format_id': 'http-%d' % bitrate if bitrate else 'http',
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'tbr': bitrate,
+ 'format': 'mp4',
+ })
+
+ if not bitrates:
+ # When subscriptionLevel > 0, i.e. plus subscription is required
+ # media list will be empty. However, hds and hls uris are still
+ # available. We can grab them assuming bitrates to be default.
+ bitrates = self._DEFAULT_BITRATES
+
+ auth_token = embed_vars.get('AuthToken')
+
+ def construct_manifest_url(base_url, ext):
+ pieces = [base_url]
+ pieces.extend([compat_str(b) for b in bitrates])
+ pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token))
+ return ','.join(pieces)
+
+ if bitrates and auth_token:
+ hds_url = embed_vars.get('hdsUri')
+ if hds_url:
+ f4m_formats = self._extract_f4m_formats(
+ construct_manifest_url(hds_url, 'f4m'),
+ display_id, f4m_id='hds', fatal=False)
+ if len(f4m_formats) == len(bitrates):
+ for f, bitrate in zip(f4m_formats, bitrates):
+ if not f.get('tbr'):
+ f['format_id'] = 'hds-%d' % bitrate
+ f['tbr'] = bitrate
+ # TODO: fix f4m downloader to handle manifests without bitrates if possible
+ # formats.extend(f4m_formats)
+
+ hls_url = embed_vars.get('hlsUri')
+ if hls_url:
+ formats.extend(self._extract_m3u8_formats(
+ construct_manifest_url(hls_url, 'm3u8'),
+ display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': embed_vars.get('thumbUri'),
+ 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None,
+ 'age_limit': parse_age_limit(embed_vars.get('audienceRating')),
+ 'tags': embed_vars.get('tags', '').split(','),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index e5d62a139..44b0bbee6 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -40,7 +40,7 @@ class ScreenwaveMediaIE(InfoExtractor):
re.sub(
r'(?s)/\*.*?\*/', '',
self._search_regex(
- r"sources\s*:\s*(\[[^\]]+?\])", playerconfig,
+ r'sources\s*:\s*(\[[^\]]+?\])', playerconfig,
'sources',
).replace(
"' + thisObj.options.videoserver + '",
@@ -70,25 +70,27 @@ class ScreenwaveMediaIE(InfoExtractor):
formats = []
for source in sources:
- if source['type'] == 'hls':
- formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4'))
+ file_ = source.get('file')
+ if not file_:
+ continue
+ if source.get('type') == 'hls':
+ formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4'))
else:
- file_ = source.get('file')
- if not file_:
- continue
- format_label = source.get('label')
format_id = self._search_regex(
r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
+ if not self._is_valid_url(file_, video_id, format_id or 'video'):
+ continue
+ format_label = source.get('label')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]', format_label, 'height', default=None))
formats.append({
- 'url': source['file'],
+ 'url': file_,
'format_id': format_id,
'format': format_label,
'ext': source.get('type'),
'height': height,
})
- self._sort_formats(formats)
+ self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index 990ea0fa8..4d3b58522 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -15,37 +15,37 @@ from ..compat import (
class SenateISVPIE(InfoExtractor):
_COMM_MAP = [
- ["ag", "76440", "http://ag-f.akamaihd.net"],
- ["aging", "76442", "http://aging-f.akamaihd.net"],
- ["approps", "76441", "http://approps-f.akamaihd.net"],
- ["armed", "76445", "http://armed-f.akamaihd.net"],
- ["banking", "76446", "http://banking-f.akamaihd.net"],
- ["budget", "76447", "http://budget-f.akamaihd.net"],
- ["cecc", "76486", "http://srs-f.akamaihd.net"],
- ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
- ["csce", "75229", "http://srs-f.akamaihd.net"],
- ["dpc", "76590", "http://dpc-f.akamaihd.net"],
- ["energy", "76448", "http://energy-f.akamaihd.net"],
- ["epw", "76478", "http://epw-f.akamaihd.net"],
- ["ethics", "76449", "http://ethics-f.akamaihd.net"],
- ["finance", "76450", "http://finance-f.akamaihd.net"],
- ["foreign", "76451", "http://foreign-f.akamaihd.net"],
- ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
- ["help", "76452", "http://help-f.akamaihd.net"],
- ["indian", "76455", "http://indian-f.akamaihd.net"],
- ["intel", "76456", "http://intel-f.akamaihd.net"],
- ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
- ["jccic", "85180", "http://jccic-f.akamaihd.net"],
- ["jec", "76458", "http://jec-f.akamaihd.net"],
- ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
- ["rpc", "76591", "http://rpc-f.akamaihd.net"],
- ["rules", "76460", "http://rules-f.akamaihd.net"],
- ["saa", "76489", "http://srs-f.akamaihd.net"],
- ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
- ["srs", "75229", "http://srs-f.akamaihd.net"],
- ["uscc", "76487", "http://srs-f.akamaihd.net"],
- ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
- ["arch", "", "http://ussenate-f.akamaihd.net/"]
+ ['ag', '76440', 'http://ag-f.akamaihd.net'],
+ ['aging', '76442', 'http://aging-f.akamaihd.net'],
+ ['approps', '76441', 'http://approps-f.akamaihd.net'],
+ ['armed', '76445', 'http://armed-f.akamaihd.net'],
+ ['banking', '76446', 'http://banking-f.akamaihd.net'],
+ ['budget', '76447', 'http://budget-f.akamaihd.net'],
+ ['cecc', '76486', 'http://srs-f.akamaihd.net'],
+ ['commerce', '80177', 'http://commerce1-f.akamaihd.net'],
+ ['csce', '75229', 'http://srs-f.akamaihd.net'],
+ ['dpc', '76590', 'http://dpc-f.akamaihd.net'],
+ ['energy', '76448', 'http://energy-f.akamaihd.net'],
+ ['epw', '76478', 'http://epw-f.akamaihd.net'],
+ ['ethics', '76449', 'http://ethics-f.akamaihd.net'],
+ ['finance', '76450', 'http://finance-f.akamaihd.net'],
+ ['foreign', '76451', 'http://foreign-f.akamaihd.net'],
+ ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'],
+ ['help', '76452', 'http://help-f.akamaihd.net'],
+ ['indian', '76455', 'http://indian-f.akamaihd.net'],
+ ['intel', '76456', 'http://intel-f.akamaihd.net'],
+ ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'],
+ ['jccic', '85180', 'http://jccic-f.akamaihd.net'],
+ ['jec', '76458', 'http://jec-f.akamaihd.net'],
+ ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'],
+ ['rpc', '76591', 'http://rpc-f.akamaihd.net'],
+ ['rules', '76460', 'http://rules-f.akamaihd.net'],
+ ['saa', '76489', 'http://srs-f.akamaihd.net'],
+ ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'],
+ ['srs', '75229', 'http://srs-f.akamaihd.net'],
+ ['uscc', '76487', 'http://srs-f.akamaihd.net'],
+ ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
+ ['arch', '', 'http://ussenate-f.akamaihd.net/']
]
_IE_NAME = 'senate.gov'
_VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py
index 6365a8779..a99b2a8e7 100644
--- a/youtube_dl/extractor/sexu.py
+++ b/youtube_dl/extractor/sexu.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -14,7 +12,7 @@ class SexuIE(InfoExtractor):
'id': '961791',
'ext': 'mp4',
'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
- 'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
+ 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
'categories': list, # NSFW
'thumbnail': 're:https?://.*\.jpg$',
'age_limit': 18,
@@ -25,13 +23,18 @@ class SexuIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- quality_arr = self._search_regex(
- r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string')
+ jwvideo = self._parse_json(
+ self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
+ video_id)
+
+ sources = jwvideo['sources']
+
formats = [{
- 'url': fmt[0].replace('\\', ''),
- 'format_id': fmt[1],
- 'height': int(fmt[1][:3]),
- } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)]
+ 'url': source['file'].replace('\\', ''),
+ 'format_id': source.get('label'),
+ 'height': self._search_regex(
+ r'^(\d+)[pP]', source.get('label', ''), 'height', default=None),
+ } for source in sources if source.get('file')]
self._sort_formats(formats)
title = self._html_search_regex(
@@ -40,9 +43,7 @@ class SexuIE(InfoExtractor):
description = self._html_search_meta(
'description', webpage, 'description')
- thumbnail = self._html_search_regex(
- r'image:\s*"([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = jwvideo.get('image')
categories_str = self._html_search_meta(
'keywords', webpage, 'categories')
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
index 3df71304d..7efb29f65 100644
--- a/youtube_dl/extractor/slutload.py
+++ b/youtube_dl/extractor/slutload.py
@@ -13,8 +13,8 @@ class SlutloadIE(InfoExtractor):
'info_dict': {
'id': 'TD73btpBqSxc',
'ext': 'mp4',
- "title": "virginie baisee en cam",
- "age_limit": 18,
+ 'title': 'virginie baisee en cam',
+ 'age_limit': 18,
'thumbnail': 're:https?://.*?\.jpg'
}
}
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 30210c8a3..015ef75f3 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -170,7 +170,7 @@ class SmotriIE(InfoExtractor):
'getvideoinfo': '1',
}
- video_password = self._downloader.params.get('videopassword', None)
+ video_password = self._downloader.params.get('videopassword')
if video_password:
video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
@@ -356,7 +356,7 @@ class SmotriBroadcastIE(InfoExtractor):
url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
- broadcast_password = self._downloader.params.get('videopassword', None)
+ broadcast_password = self._downloader.params.get('videopassword')
if broadcast_password:
url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py
index da3b05a8d..0d1ab07f8 100644
--- a/youtube_dl/extractor/snotr.py
+++ b/youtube_dl/extractor/snotr.py
@@ -43,7 +43,7 @@ class SnotrIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
- video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id
+ video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id
view_count = str_to_int(self._html_search_regex(
r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>',
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index b2d5487ca..1efb2b980 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -222,7 +222,7 @@ class SoundcloudIE(InfoExtractor):
full_title = track_id
token = mobj.group('secret_token')
if token:
- info_json_url += "&secret_token=" + token
+ info_json_url += '&secret_token=' + token
elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
real_url = query['url'][0]
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
deleted file mode 100644
index ebb5d6ec0..000000000
--- a/youtube_dl/extractor/space.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
-from ..utils import RegexNotFoundError, ExtractorError
-
-
-class SpaceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
- _TEST = {
- 'add_ie': ['BrightcoveLegacy'],
- 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
- 'info_dict': {
- 'id': '2780937028001',
- 'ext': 'mp4',
- 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
- 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
- 'uploader': 'TechMedia Networks',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- try:
- # Some videos require the playerKey field, which isn't define in
- # the BrightcoveExperience object
- brightcove_url = self._og_search_video_url(webpage)
- except RegexNotFoundError:
- # Other videos works fine with the info from the object
- brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if brightcove_url is None:
- raise ExtractorError(
- 'The webpage does not contain a video', expected=True)
- return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
index 183dcb03c..1a831ef6d 100644
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -22,23 +22,23 @@ class SteamIE(InfoExtractor):
_VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
_AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
_TESTS = [{
- "url": "http://store.steampowered.com/video/105600/",
- "playlist": [
+ 'url': 'http://store.steampowered.com/video/105600/',
+ 'playlist': [
{
- "md5": "f870007cee7065d7c76b88f0a45ecc07",
- "info_dict": {
+ 'md5': 'f870007cee7065d7c76b88f0a45ecc07',
+ 'info_dict': {
'id': '81300',
'ext': 'flv',
- "title": "Terraria 1.1 Trailer",
+ 'title': 'Terraria 1.1 Trailer',
'playlist_index': 1,
}
},
{
- "md5": "61aaf31a5c5c3041afb58fb83cbb5751",
- "info_dict": {
+ 'md5': '61aaf31a5c5c3041afb58fb83cbb5751',
+ 'info_dict': {
'id': '80859',
'ext': 'flv',
- "title": "Terraria Trailer",
+ 'title': 'Terraria Trailer',
'playlist_index': 2,
}
}
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index a48d77c30..cf8851438 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -73,7 +73,7 @@ class TEDIE(InfoExtractor):
'add_ie': ['Youtube'],
'info_dict': {
'id': '_ZG8HBuDjgc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Douglas Adams: Parrots the Universe and Everything',
'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
'uploader': 'University of California Television (UCTV)',
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index f6694149b..02a31a609 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -27,10 +27,10 @@ class TenPlayIE(InfoExtractor):
}
_video_fields = [
- "id", "name", "shortDescription", "longDescription", "creationDate",
- "publishedDate", "lastModifiedDate", "customFields", "videoStillURL",
- "thumbnailURL", "referenceId", "length", "playsTotal",
- "playsTrailingWeek", "renditions", "captioning", "startDate", "endDate"]
+ 'id', 'name', 'shortDescription', 'longDescription', 'creationDate',
+ 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL',
+ 'thumbnailURL', 'referenceId', 'length', 'playsTotal',
+ 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate']
def _real_extract(self, url):
webpage = self._download_webpage(url, url)
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 6890021cf..9ee844684 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -48,8 +48,6 @@ class TF1IE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
wat_id = self._html_search_regex(
- r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
+ r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1',
webpage, 'wat id', group='id')
- wat_info = self._download_json(
- 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
- return self.url_result(wat_info['media']['url'], 'Wat')
+ return self.url_result('wat:%s' % wat_id, 'Wat')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 10f2cad55..9a57b49df 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -20,8 +20,9 @@ from ..utils import (
int_or_none,
sanitized_Request,
unsmuggle_url,
- url_basename,
xpath_with_ns,
+ mimetype2ext,
+ find_xpath_attr,
)
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -31,15 +32,11 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
class ThePlatformBaseIE(InfoExtractor):
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
meta = self._download_xml(smil_url, video_id, note=note)
- try:
- error_msg = next(
- n.attrib['abstract']
- for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
- except StopIteration:
- pass
- else:
- raise ExtractorError(error_msg, expected=True)
+ error_element = find_xpath_attr(
+ meta, _x('.//smil:ref'), 'src',
+ 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4')
+ if error_element is not None:
+ raise ExtractorError(error_element.attrib['abstract'], expected=True)
formats = self._parse_smil_formats(
meta, smil_url, video_id, namespace=default_ns,
@@ -69,7 +66,7 @@ class ThePlatformBaseIE(InfoExtractor):
for caption in captions:
lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
subtitles[lang] = [{
- 'ext': 'srt' if mime == 'text/srt' else 'ttml',
+ 'ext': mimetype2ext(mime),
'url': src,
}]
@@ -283,8 +280,8 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
first_video_id = None
duration = None
for item in entry['media$content']:
- smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M'
- cur_video_id = url_basename(smil_url)
+ smil_url = item['plfile$url'] + '&format=SMIL&mbr=true'
+ cur_video_id = ThePlatformIE._match_id(smil_url)
if first_video_id is None:
first_video_id = cur_video_id
duration = float_or_none(item.get('plfile$duration'))
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
index 5d09eb9a8..d8b1fd281 100644
--- a/youtube_dl/extractor/thesixtyone.py
+++ b/youtube_dl/extractor/thesixtyone.py
@@ -48,22 +48,22 @@ class TheSixtyOneIE(InfoExtractor):
]
_DECODE_MAP = {
- "x": "a",
- "m": "b",
- "w": "c",
- "q": "d",
- "n": "e",
- "p": "f",
- "a": "0",
- "h": "1",
- "e": "2",
- "u": "3",
- "s": "4",
- "i": "5",
- "o": "6",
- "y": "7",
- "r": "8",
- "c": "9"
+ 'x': 'a',
+ 'm': 'b',
+ 'w': 'c',
+ 'q': 'd',
+ 'n': 'e',
+ 'p': 'f',
+ 'a': '0',
+ 'h': '1',
+ 'e': '2',
+ 'u': '3',
+ 's': '4',
+ 'i': '5',
+ 'o': '6',
+ 'y': '7',
+ 'r': '8',
+ 'c': '9'
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index adc05ed5f..17add9543 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -4,12 +4,12 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE
-from ..compat import compat_urlparse
+from ..compat import compat_parse_qs
class TlcDeIE(InfoExtractor):
IE_NAME = 'tlc.de'
- _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)'
+ _VALID_URL = r'http://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?'
_TEST = {
'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
@@ -17,32 +17,23 @@ class TlcDeIE(InfoExtractor):
'id': '3235167922001',
'ext': 'mp4',
'title': 'Breaking Amish: Die Welt da draußen',
- 'uploader': 'Discovery Networks - Germany',
'description': (
'Vier Amische und eine Mennonitin wagen in New York'
' den Sprung in ein komplett anderes Leben. Begleitet sie auf'
' ihrem spannenden Weg.'),
+ 'timestamp': 1396598084,
+ 'upload_date': '20140404',
+ 'uploader_id': '1659832546',
},
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- iframe_url = self._search_regex(
- '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage,
- 'iframe url')
- # Otherwise we don't get the correct 'BrightcoveExperience' element,
- # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/
- iframe_url = iframe_url.replace('.htm?', '.php?')
- url_fragment = compat_urlparse.urlparse(url).fragment
- if url_fragment:
- # Since the fragment is not send to the server, we always get the same iframe
- iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url)
- iframe = self._download_webpage(iframe_url, title)
-
- return {
- '_type': 'url',
- 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe),
- 'ie': BrightcoveLegacyIE.ie_key(),
- }
+ brightcove_id = mobj.group('id')
+ if not brightcove_id:
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
+ brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0]
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 49516abca..79f036fe4 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -71,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id
webpage = self._download_webpage(url, display_id)
@@ -117,7 +117,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
title = self._html_search_regex(
self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
- age_limit = self._rta_search(webpage)
+ age_limit = self._rta_search(webpage) or 18
duration = parse_duration(self._html_search_meta(
'duration', webpage, 'duration', default=None))
@@ -152,6 +152,36 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
}
+class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+
+ _TITLE_REGEX = r'<title>([^<]+)</title>'
+
+ _TESTS = [{
+ 'url': 'https://player.tnaflix.com/video/6538',
+ 'info_dict': {
+ 'id': '6538',
+ 'display_id': '6538',
+ 'ext': 'mp4',
+ 'title': 'Educational xxx video',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://player.empflix.com/video/33051',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
+ webpage)]
+
+
class TNAFlixIE(TNAFlixNetworkBaseIE):
_VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py
index 1c53a3fd0..0e01b15fc 100644
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -38,12 +38,12 @@ class TrailerAddictIE(InfoExtractor):
# Presence of (no)watchplus function indicates HD quality is available
if re.search(r'function (no)?watchplus()', webpage):
- fvar = "fvarhd"
+ fvar = 'fvarhd'
else:
- fvar = "fvar"
+ fvar = 'fvar'
- info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id))
- info_webpage = self._download_webpage(info_url, video_id, "Downloading the info webpage")
+ info_url = 'http://www.traileraddict.com/%s.php?tid=%s' % (fvar, str(video_id))
+ info_webpage = self._download_webpage(info_url, video_id, 'Downloading the info webpage')
final_url = self._search_regex(r'&fileurl=(.+)',
info_webpage, 'Download url').replace('%3F', '?')
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index da3cd76f7..f56b66d06 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -49,7 +49,7 @@ class TudouIE(InfoExtractor):
info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
if quality:
info_url += '&hd' + quality
- xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")
+ xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
final_url = xml_data.text
return final_url
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 69882da63..d4169ec6d 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -17,6 +17,7 @@ from ..utils import (
encode_dict,
ExtractorError,
int_or_none,
+ orderedSet,
parse_duration,
parse_iso8601,
sanitized_Request,
@@ -251,6 +252,7 @@ class TwitchVodIE(TwitchItemBaseIE):
self._USHER_BASE, item_id,
compat_urllib_parse.urlencode({
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'allow_spectre': 'true',
'player': 'twitchweb',
'nauth': access_token['token'],
@@ -281,17 +283,37 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
entries = []
offset = 0
limit = self._PAGE_LIMIT
+ broken_paging_detected = False
+ counter_override = None
for counter in itertools.count(1):
response = self._download_json(
self._PLAYLIST_URL % (channel_id, offset, limit),
- channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+ channel_id,
+ 'Downloading %s videos JSON page %s'
+ % (self._PLAYLIST_TYPE, counter_override or counter))
page_entries = self._extract_playlist_page(response)
if not page_entries:
break
+ total = int_or_none(response.get('_total'))
+ # Since the beginning of March 2016 twitch's paging mechanism
+ # is completely broken on the twitch side. It simply ignores
+ # a limit and returns the whole offset number of videos.
+ # Working around by just requesting all videos at once.
+ # Upd: pagination bug was fixed by twitch on 15.03.2016.
+ if not broken_paging_detected and total and len(page_entries) > limit:
+ self.report_warning(
+ 'Twitch pagination is broken on twitch side, requesting all videos at once',
+ channel_id)
+ broken_paging_detected = True
+ offset = total
+ counter_override = '(all at once)'
+ continue
entries.extend(page_entries)
+ if broken_paging_detected or total and len(page_entries) >= total:
+ break
offset += limit
return self.playlist_result(
- [self.url_result(entry) for entry in set(entries)],
+ [self.url_result(entry) for entry in orderedSet(entries)],
channel_id, channel_name)
def _extract_playlist_page(self, response):
@@ -411,6 +433,7 @@ class TwitchStreamIE(TwitchBaseIE):
query = {
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'p': random.randint(1000000, 10000000),
'player': 'twitchweb',
'segment_preference': '4',
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index a161f046b..e70b2ab3c 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -10,21 +10,26 @@ from ..utils import (
remove_end,
int_or_none,
ExtractorError,
- sanitized_Request,
)
-class TwitterCardIE(InfoExtractor):
+class TwitterBaseIE(InfoExtractor):
+ def _get_vmap_video_url(self, vmap_url, video_id):
+ vmap_data = self._download_xml(vmap_url, video_id)
+ return xpath_text(vmap_data, './/MediaFile').strip()
+
+
+class TwitterCardIE(TwitterBaseIE):
IE_NAME = 'twitter:card'
- _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
- 'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
+ # MD5 checksums are different in different places
'info_dict': {
'id': '560070183650213889',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 30.033,
}
@@ -35,14 +40,14 @@ class TwitterCardIE(InfoExtractor):
'info_dict': {
'id': '623160978427936768',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 80.155,
},
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
- 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+ 'md5': 'd4724ffe6d2437886d004fa5de1043b3',
'info_dict': {
'id': 'dq4Oj5quskI',
'ext': 'mp4',
@@ -62,69 +67,106 @@ class TwitterCardIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20151113',
'uploader_id': '1189339351084113920',
- 'uploader': '@ArsenalTerje',
- 'title': 'Vine by @ArsenalTerje',
+ 'uploader': 'ArsenalTerje',
+ 'title': 'Vine by ArsenalTerje',
},
'add_ie': ['Vine'],
- }
+ }, {
+ 'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
+ 'md5': '3846d0a07109b5ab622425449b59049d',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Twitter web player',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ },
]
def _real_extract(self, url):
video_id = self._match_id(url)
- # Different formats served for different User-Agents
- USER_AGENTS = [
- 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4
- 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm
- ]
-
config = None
formats = []
- for user_agent in USER_AGENTS:
- request = sanitized_Request(url)
- request.add_header('User-Agent', user_agent)
- webpage = self._download_webpage(request, video_id)
-
- iframe_url = self._html_search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
- webpage, 'video iframe', default=None)
- if iframe_url:
- return self.url_result(iframe_url)
-
- config = self._parse_json(self._html_search_regex(
- r'data-player-config="([^"]+)"', webpage, 'data player config'),
- video_id)
- if 'playlist' not in config:
- if 'vmapUrl' in config:
- vmap_data = self._download_xml(config['vmapUrl'], video_id)
- video_url = xpath_text(vmap_data, './/MediaFile').strip()
- formats.append({
- 'url': video_url,
- })
- break # same video regardless of UA
- continue
-
- video_url = config['playlist'][0]['source']
+ duration = None
- f = {
- 'url': video_url,
- }
+ webpage = self._download_webpage(url, video_id)
+
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
+ webpage, 'video iframe', default=None)
+ if iframe_url:
+ return self.url_result(iframe_url)
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),
+ video_id)
+ def _search_dimensions_in_video_url(a_format, video_url):
m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
if m:
- f.update({
+ a_format.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
+
+ playlist = config.get('playlist')
+ if playlist:
+ video_url = playlist[0]['source']
+
+ f = {
+ 'url': video_url,
+ }
+
+ _search_dimensions_in_video_url(f, video_url)
+
formats.append(f)
+
+ vmap_url = config.get('vmapUrl') or config.get('vmap_url')
+ if vmap_url:
+ formats.append({
+ 'url': self._get_vmap_video_url(vmap_url, video_id),
+ })
+
+ media_info = None
+
+ for entity in config.get('status', {}).get('entities', []):
+ if 'mediaInfo' in entity:
+ media_info = entity['mediaInfo']
+
+ if media_info:
+ for media_variant in media_info['variants']:
+ media_url = media_variant['url']
+ if media_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif media_url.endswith('.mpd'):
+ formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+ else:
+ vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
+ a_format = {
+ 'url': media_url,
+ 'format_id': 'http-%d' % vbr if vbr else 'http',
+ 'vbr': vbr,
+ }
+ # Reported bitRate may be zero
+ if not a_format['vbr']:
+ del a_format['vbr']
+
+ _search_dimensions_in_video_url(a_format, media_url)
+
+ formats.append(a_format)
+
+ duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
+
self._sort_formats(formats)
- thumbnail = config.get('posterImageUrl')
- duration = float_or_none(config.get('duration'))
+ title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ thumbnail = config.get('posterImageUrl') or config.get('image_src')
+ duration = float_or_none(config.get('duration')) or duration
return {
'id': video_id,
- 'title': 'TwitterCard',
+ 'title': title,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
@@ -138,7 +180,6 @@ class TwitterIE(InfoExtractor):
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
- 'md5': 'db6612ec5d03355953c3ca9250c97e5e',
'info_dict': {
'id': '643211948184596480',
'ext': 'mp4',
@@ -149,6 +190,9 @@ class TwitterIE(InfoExtractor):
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@@ -161,6 +205,7 @@ class TwitterIE(InfoExtractor):
'uploader': 'Gifs',
'uploader_id': 'giphz',
},
+ 'expected_warnings': ['height', 'width'],
}, {
'url': 'https://twitter.com/starwars/status/665052190608723968',
'md5': '39b7199856dee6cd4432e72c74bc69d4',
@@ -172,6 +217,36 @@ class TwitterIE(InfoExtractor):
'uploader_id': 'starwars',
'uploader': 'Star Wars',
},
+ }, {
+ 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
+ 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
+ 'uploader_id': 'BTNBrentYarina',
+ 'uploader': 'Brent Yarina',
+ },
+ 'params': {
+ # The same video as https://twitter.com/i/videos/tweet/705235433198714880
+ # Test case of TwitterCardIE
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '700207533655363584',
+ 'ext': 'mp4',
+ 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'jay',
+ 'uploader_id': 'jaydingeer',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}]
def _real_extract(self, url):
@@ -208,21 +283,91 @@ class TwitterIE(InfoExtractor):
return info
mobj = re.search(r'''(?x)
- <video[^>]+class="animated-gif"[^>]+
- (?:data-height="(?P<height>\d+)")?[^>]+
- (?:data-width="(?P<width>\d+)")?[^>]+
- (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
+ <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
<source[^>]+video-src="(?P<url>[^"]+)"
''', webpage)
if mobj:
+ more_info = mobj.group('more_info')
+ height = int_or_none(self._search_regex(
+ r'data-height="(\d+)"', more_info, 'height', fatal=False))
+ width = int_or_none(self._search_regex(
+ r'data-width="(\d+)"', more_info, 'width', fatal=False))
+ thumbnail = self._search_regex(
+ r'poster="([^"]+)"', more_info, 'poster', fatal=False)
info.update({
'id': twid,
'url': mobj.group('url'),
- 'height': int_or_none(mobj.group('height')),
- 'width': int_or_none(mobj.group('width')),
- 'thumbnail': mobj.group('poster'),
+ 'height': height,
+ 'width': width,
+ 'thumbnail': thumbnail,
})
return info
- raise ExtractorError('There\'s not video in this tweet.')
+ if 'class="PlayableMedia' in webpage:
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid),
+ })
+
+ return info
+
+ raise ExtractorError('There\'s no video in this tweet.')
+
+
+class TwitterAmplifyIE(TwitterBaseIE):
+ IE_NAME = 'twitter:amplify'
+ _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'
+
+ _TEST = {
+ 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+ 'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
+ 'info_dict': {
+ 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+ 'ext': 'mp4',
+ 'title': 'Twitter Video',
+ 'thumbnail': 're:^https?://.*',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ vmap_url = self._html_search_meta(
+ 'twitter:amplify:vmap', webpage, 'vmap url')
+ video_url = self._get_vmap_video_url(vmap_url, video_id)
+
+ thumbnails = []
+ thumbnail = self._html_search_meta(
+ 'twitter:image:src', webpage, 'thumbnail', fatal=False)
+
+ def _find_dimension(target):
+ w = int_or_none(self._html_search_meta(
+ 'twitter:%s:width' % target, webpage, fatal=False))
+ h = int_or_none(self._html_search_meta(
+ 'twitter:%s:height' % target, webpage, fatal=False))
+ return w, h
+
+ if thumbnail:
+ thumbnail_w, thumbnail_h = _find_dimension('image')
+ thumbnails.append({
+ 'url': thumbnail,
+ 'width': thumbnail_w,
+ 'height': thumbnail_h,
+ })
+
+ video_w, video_h = _find_dimension('player')
+ formats = [{
+ 'url': video_url,
+ 'width': video_w,
+ 'height': video_h,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': 'Twitter Video',
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py
new file mode 100644
index 000000000..e5678dc78
--- /dev/null
+++ b/youtube_dl/extractor/usatoday.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ get_element_by_attribute,
+ parse_duration,
+ update_url_query,
+ ExtractorError,
+)
+from ..compat import compat_str
+
+
+class USATodayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)'
+ _TEST = {
+ 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/',
+ 'md5': '4d40974481fa3475f8bccfd20c5361f8',
+ 'info_dict': {
+ 'id': '81729424',
+ 'ext': 'mp4',
+ 'title': 'US, France warn Syrian regime ahead of new peace talks',
+ 'timestamp': 1457891045,
+ 'description': 'md5:7e50464fdf2126b0f533748d3c78d58f',
+ 'uploader_id': '29906170001',
+ 'upload_date': '20160313',
+ }
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id)
+ ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage)
+ if not ui_video_data:
+ raise ExtractorError('no video on the webpage', expected=True)
+ video_data = self._parse_json(ui_video_data, display_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'],
+ 'id': compat_str(video_data['id']),
+ 'title': video_data['title'],
+ 'thumbnail': video_data.get('thumbnail'),
+ 'description': video_data.get('description'),
+ 'duration': parse_duration(video_data.get('length')),
+ 'ie_key': 'BrightcoveNew',
+ }
diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py
new file mode 100644
index 000000000..cafc082b6
--- /dev/null
+++ b/youtube_dl/extractor/ustudio.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+
+
+class UstudioIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
+ 'md5': '58bbfca62125378742df01fc2abbdef6',
+ 'info_dict': {
+ 'id': 'Uxu2my9bgSph',
+ 'display_id': 'san_francisco_golden_gate_bridge',
+ 'ext': 'mp4',
+ 'title': 'San Francisco: Golden Gate Bridge',
+ 'description': 'md5:23925500697f2c6d4830e387ba51a9be',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20111107',
+ 'uploader': 'Tony Farley',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ config = self._download_xml(
+ 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
+ display_id)
+
+ def extract(kind):
+ return [{
+ 'url': item.attrib['url'],
+ 'width': int_or_none(item.get('width')),
+ 'height': int_or_none(item.get('height')),
+ } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
+
+ formats = extract('video')
+ self._sort_formats(formats)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
+ webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'Uploaded by\s*<a[^>]*>([^<]+)<',
+ webpage, 'uploader', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnails': extract('image'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index 1e740fbe6..3794bcded 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -47,7 +47,7 @@ class Vbox7IE(InfoExtractor):
title = self._html_search_regex(r'<title>(.*)</title>',
webpage, 'title').split('/')[0].strip()
- info_url = "http://vbox7.com/play/magare.do"
+ info_url = 'http://vbox7.com/play/magare.do'
data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
info_request = sanitized_Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 14e945d49..e148b1ef5 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):
'aftenbladet.no/tv': 'satv',
'fvn.no/fvntv': 'fvntv',
'aftenposten.no/webtv': 'aptv',
+ 'ap.vgtv.no/webtv': 'aptv',
}
_APP_NAME_TO_VENDOR = {
@@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):
(?P<host>
%s
)
- /
+ /?
(?:
\#!/(?:video|live)/|
embed?.*id=
@@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):
'md5': 'fd828cd29774a729bf4d4425fe192972',
'info_dict': {
'id': '21039',
- 'ext': 'mov',
+ 'ext': 'mp4',
'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',
'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
'duration': 66,
'timestamp': 1417002452,
'upload_date': '20141126',
'view_count': int,
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
'only_matching': True,
},
+ {
+ 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):
if len(video_id) == 5:
if appname == 'bttv':
info = self._extract_video_info('btno', video_id)
- elif appname == 'aptv':
- info = self._extract_video_info('ap', video_id)
streams = data['streamUrls']
stream_type = data.get('streamType')
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 3db6286e4..46c785ae1 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,31 +1,37 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
-
- _TESTS = [
- {
- 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- 'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
- 'ext': 'mp4',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
- },
- 'params': {
- # Requires ffmpeg (m3u8 manifest)
- 'skip_download': True,
- },
- }, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ 'duration': 725.983,
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):
except ExtractorError:
raise ExtractorError('The page doesn\'t contain a video', expected=True)
return self.url_result(ooyala_url, ie='Ooyala')
+
+
+class ViceShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
+
+ _TEST = {
+ 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
+ 'info_dict': {
+ 'id': 'fuck-thats-delicious-2',
+ 'title': "Fuck, That's Delicious",
+ 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
+ },
+ 'playlist_count': 17,
+ }
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+
+ entries = [
+ self.url_result(video_url, ViceIE.ie_key())
+ for video_url, _ in re.findall(
+ r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
+ % ViceIE._VALID_URL, webpage)]
+
+ title = self._search_regex(
+ r'<title>(.+?)</title>', webpage, 'title', default=None)
+ if title:
+ title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
+ description = self._html_search_meta('description', webpage, 'description')
+
+ return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 5e2e7cbac..4f0dcd18c 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import sanitized_Request
+from ..utils import (
+ decode_packed_codes,
+ sanitized_Request,
+)
class VideoMegaIE(InfoExtractor):
- _WORKING = False
_VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
@@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor):
r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+ real_codes = decode_packed_codes(webpage)
video_url = self._search_regex(
- r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
+ r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
index 3176e3b9d..5de8273c3 100644
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -26,7 +26,7 @@ class VideoPremiumIE(InfoExtractor):
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
- if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+ if re.match(r'^<html><head><script[^>]*>window.location\s*=', webpage):
# Download again, we need a cookie
webpage = self._download_webpage(
webpage_url, video_id,
@@ -37,10 +37,10 @@ class VideoPremiumIE(InfoExtractor):
return {
'id': video_id,
- 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
- 'play_path': "mp4:%s.f4v" % video_id,
- 'page_url': "http://videopremium.tv/" + video_id,
- 'player_url': "http://videopremium.tv/uplayer/uppod.swf",
+ 'url': 'rtmp://e%d.md.iplay.md/play' % random.randint(1, 16),
+ 'play_path': 'mp4:%s.f4v' % video_id,
+ 'page_url': 'http://videopremium.tv/' + video_id,
+ 'player_url': 'http://videopremium.tv/uplayer/uppod.swf',
'ext': 'f4v',
'title': video_title,
}
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index 7c6e98026..3c78fb3d5 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -1,11 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import smuggle_url
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+ decode_packed_codes,
+ js_to_json,
+)
-class VidziIE(InfoExtractor):
+class VidziIE(JWPlatformBaseIE):
_VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
_TEST = {
'url': 'http://vidzi.tv/cghql9yq6emu.html',
@@ -14,7 +17,6 @@ class VidziIE(InfoExtractor):
'id': 'cghql9yq6emu',
'ext': 'mp4',
'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
- 'uploader': 'vidzi.tv',
},
'params': {
# m3u8 download
@@ -29,11 +31,12 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- # Vidzi now uses jwplayer, which can be handled by GenericIE
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': smuggle_url(url, {'to_generic': True}),
- 'ie_key': 'Generic',
- }
+ code = decode_packed_codes(webpage).replace('\\\'', '\'')
+ jwplayer_data = self._parse_json(
+ self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'),
+ video_id, transform_source=js_to_json)
+
+ info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+ info_dict['title'] = title
+
+ return info_dict
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 433fc9914..e04b814c8 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -176,13 +176,13 @@ class VikiIE(VikiBaseIE):
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
- 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'md5': '63f8600c1da6f01b7640eee7eca4f1da',
'info_dict': {
'id': '50562v',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Poor Nastya [COMPLETE] - Episode 1',
'description': '',
- 'duration': 607,
+ 'duration': 606,
'timestamp': 1274949505,
'upload_date': '20101213',
'uploader': 'ad14065n',
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 2389e7f0f..71c30d2cd 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -57,7 +57,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
def _extract_xsrft_and_vuid(self, webpage):
xsrft = self._search_regex(
- r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
+ r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
webpage, 'login token', group='xsrft')
vuid = self._search_regex(
r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1',
@@ -73,15 +73,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
- https?://
- (?:(?:www|(?P<player>player))\.)?
- vimeo(?P<pro>pro)?\.com/
- (?!channels/[^/?#]+/?(?:$|[?#])|album/)
- (?:.*?/)?
- (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
- (?:videos?/)?
- (?P<id>[0-9]+)
- /?(?:[?&].*)?(?:[#].*)?$'''
+ https?://
+ (?:
+ (?:
+ www|
+ (?P<player>player)
+ )
+ \.
+ )?
+ vimeo(?P<pro>pro)?\.com/
+ (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/)
+ (?:.*?/)?
+ (?:
+ (?:
+ play_redirect_hls|
+ moogaloop\.swf)\?clip_id=
+ )?
+ (?:videos?/)?
+ (?P<id>[0-9]+)
+ /?(?:[?&].*)?(?:[#].*)?$
+ '''
IE_NAME = 'vimeo'
_TESTS = [
{
@@ -93,6 +104,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'upload_date': '20121220',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434',
'uploader_id': 'user7108434',
'uploader': 'Filippo Valsorda',
'duration': 10,
@@ -105,6 +117,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '68093876',
'ext': 'mp4',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
@@ -121,6 +134,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
'uploader': 'The BLN & Business of Software',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
'uploader_id': 'theblnbusinessofsoftware',
'duration': 3610,
'description': None,
@@ -135,6 +149,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
'upload_date': '20130614',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
@@ -154,6 +169,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Key & Peele: Terrorist Interrogation',
'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio',
'uploader_id': 'atencio',
'uploader': 'Peter Atencio',
'upload_date': '20130927',
@@ -169,6 +185,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': 'The New Vimeo Player (You Know, For Videos)',
'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
'upload_date': '20131015',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff',
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
@@ -183,6 +200,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Pier Solar OUYA Official Trailer',
'uploader': 'Tulio Gonçalves',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593',
'uploader_id': 'user28849593',
},
},
@@ -195,6 +213,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
'uploader': 'The DMCI',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci',
'uploader_id': 'dmci',
'upload_date': '20111220',
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
@@ -232,7 +251,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
return mobj.group(1)
def _verify_video_password(self, url, video_id, webpage):
- password = self._downloader.params.get('videopassword', None)
+ password = self._downloader.params.get('videopassword')
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
token, vuid = self._extract_xsrft_and_vuid(webpage)
@@ -252,7 +271,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'Verifying the password', 'Wrong password')
def _verify_player_video_password(self, url, video_id):
- password = self._downloader.params.get('videopassword', None)
+ password = self._downloader.params.get('videopassword')
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
data = urlencode_postdata(encode_dict({'password': password}))
@@ -269,9 +288,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- headers = std_headers
+ headers = std_headers.copy()
if 'http_headers' in data:
- headers = headers.copy()
headers.update(data['http_headers'])
if 'Referer' not in headers:
headers['Referer'] = url
@@ -286,7 +304,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
- request = sanitized_Request(url, None, headers)
+ request = sanitized_Request(url, headers=headers)
try:
webpage = self._download_webpage(request, video_id)
except ExtractorError as ee:
@@ -368,16 +386,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
{'force_feature_id': True}), 'Vimeo')
# Extract title
- video_title = config["video"]["title"]
+ video_title = config['video']['title']
- # Extract uploader and uploader_id
- video_uploader = config["video"]["owner"]["name"]
- video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
+ # Extract uploader, uploader_url and uploader_id
+ video_uploader = config['video'].get('owner', {}).get('name')
+ video_uploader_url = config['video'].get('owner', {}).get('url')
+ video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
# Extract video thumbnail
- video_thumbnail = config["video"].get("thumbnail")
+ video_thumbnail = config['video'].get('thumbnail')
if video_thumbnail is None:
- video_thumbs = config["video"].get("thumbs")
+ video_thumbs = config['video'].get('thumbs')
if video_thumbs and isinstance(video_thumbs, dict):
_, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
@@ -401,7 +420,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
self._downloader.report_warning('Cannot find video description')
# Extract video duration
- video_duration = int_or_none(config["video"].get("duration"))
+ video_duration = int_or_none(config['video'].get('duration'))
# Extract upload date
video_upload_date = None
@@ -473,6 +492,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
return {
'id': video_id,
'uploader': video_uploader,
+ 'uploader_url': video_uploader_url,
'uploader_id': video_uploader_id,
'upload_date': video_upload_date,
'title': video_title,
@@ -488,6 +508,38 @@ class VimeoIE(VimeoBaseInfoExtractor):
}
+class VimeoOndemandIE(VimeoBaseInfoExtractor):
+ IE_NAME = 'vimeo:ondemand'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ # ondemand video not available via https://vimeo.com/id
+ 'url': 'https://vimeo.com/ondemand/20704',
+ 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
+ 'info_dict': {
+ 'id': '105442900',
+ 'ext': 'mp4',
+ 'title': 'המעבדה - במאי יותם פלדמן',
+ 'uploader': 'גם סרטים',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms',
+ 'uploader_id': 'gumfilms',
+ },
+ }, {
+ 'url': 'https://vimeo.com/ondemand/nazmaalik',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/141692381',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key())
+
+
class VimeoChannelIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:channel'
_VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
@@ -516,7 +568,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
if not login_form:
return webpage
- password = self._downloader.params.get('videopassword', None)
+ password = self._downloader.params.get('videopassword')
if password is None:
raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
fields = self._hidden_inputs(login_form)
@@ -703,10 +755,10 @@ class VimeoLikesIE(InfoExtractor):
_TEST = {
'url': 'https://vimeo.com/user755559/likes/',
'playlist_mincount': 293,
- "info_dict": {
+ 'info_dict': {
'id': 'user755559_likes',
- "description": "See all the videos urza likes",
- "title": 'Videos urza likes',
+ 'description': 'See all the videos urza likes',
+ 'title': 'Videos urza likes',
},
}
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index cb2a4b0b5..a6a6cc479 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -119,7 +119,7 @@ class VineIE(InfoExtractor):
class VineUserIE(InfoExtractor):
IE_NAME = 'vine:user'
_VALID_URL = r'(?:https?://)?vine\.co/(?P<u>u/)?(?P<user>[^/]+)/?(\?.*)?$'
- _VINE_BASE_URL = "https://vine.co/"
+ _VINE_BASE_URL = 'https://vine.co/'
_TESTS = [
{
'url': 'https://vine.co/Visa',
@@ -139,7 +139,7 @@ class VineUserIE(InfoExtractor):
user = mobj.group('user')
u = mobj.group('u')
- profile_url = "%sapi/users/profiles/%s%s" % (
+ profile_url = '%sapi/users/profiles/%s%s' % (
self._VINE_BASE_URL, 'vanity/' if not u else '', user)
profile_data = self._download_json(
profile_url, user, note='Downloading user profile data')
@@ -147,7 +147,7 @@ class VineUserIE(InfoExtractor):
user_id = profile_data['data']['userId']
timeline_data = []
for pagenum in itertools.count(1):
- timeline_url = "%sapi/timelines/users/%s?page=%s&size=100" % (
+ timeline_url = '%sapi/timelines/users/%s?page=%s&size=100' % (
self._VINE_BASE_URL, user_id, pagenum)
timeline_page = self._download_json(
timeline_url, user, note='Downloading page %d' % pagenum)
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 0805e3c08..d560a4b5e 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -11,6 +11,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ int_or_none,
orderedSet,
sanitized_Request,
str_to_int,
@@ -141,10 +142,10 @@ class VKIE(InfoExtractor):
'url': 'https://vk.com/video276849682_170681728',
'info_dict': {
'id': 'V3K4mi0SYkc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
- 'duration': 179,
+ 'duration': 178,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
'uploader_id': 'thecjf',
@@ -152,6 +153,19 @@ class VKIE(InfoExtractor):
},
},
{
+ # video key is extra_data not url\d+
+ 'url': 'http://vk.com/video-110305615_171782105',
+ 'md5': 'e13fcda136f99764872e739d13fac1d1',
+ 'info_dict': {
+ 'id': '171782105',
+ 'ext': 'mp4',
+ 'title': 'S-Dance, репетиции к The way show',
+ 'uploader': 'THE WAY SHOW | 17 апреля',
+ 'upload_date': '20160207',
+ 'view_count': int,
+ },
+ },
+ {
# removed video, just testing that we match the pattern
'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
'only_matching': True,
@@ -298,12 +312,17 @@ class VKIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False))
- formats = [{
- 'format_id': k,
- 'url': v,
- 'width': int(k[len('url'):]),
- } for k, v in data.items()
- if k.startswith('url')]
+ formats = []
+ for k, v in data.items():
+ if not k.startswith('url') and k != 'extra_data' or not v:
+ continue
+ height = int_or_none(self._search_regex(
+ r'^url(\d+)', k, 'height', default=None))
+ formats.append({
+ 'format_id': k,
+ 'url': v,
+ 'height': height,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index 01891ac4c..2b6bae89b 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -73,11 +73,16 @@ class VRTIE(InfoExtractor):
if mobj:
formats.extend(self._extract_m3u8_formats(
'%s/%s' % (mobj.group('server'), mobj.group('path')),
- video_id, 'mp4', m3u8_id='hls'))
+ video_id, 'mp4', m3u8_id='hls', fatal=False))
mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
if mobj:
formats.extend(self._extract_f4m_formats(
- '%s/manifest.f4m' % mobj.group('src'), video_id, f4m_id='hds'))
+ '%s/manifest.f4m' % mobj.group('src'),
+ video_id, f4m_id='hds', fatal=False))
+
+ if not formats and 'data-video-geoblocking="true"' in webpage:
+ self.raise_geo_restricted('This video is only available in Belgium')
+
self._sort_formats(formats)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index affcc52f6..37cf3d309 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -12,7 +12,7 @@ from ..utils import (
class WatIE(InfoExtractor):
- _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'
+ _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
IE_NAME = 'wat.tv'
_TESTS = [
{
@@ -54,10 +54,12 @@ class WatIE(InfoExtractor):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
mobj = re.match(self._VALID_URL, url)
- short_id = mobj.group('short_id')
display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id or short_id)
- real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+ real_id = mobj.group('real_id')
+ if not real_id:
+ short_id = mobj.group('short_id')
+ webpage = self._download_webpage(url, display_id or short_id)
+ real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
video_info = self.download_video_info(real_id)
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 2037d9b3d..7aea47ed5 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor):
_VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
_GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
_USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
- _TESTS = [
- {
- 'url': 'http://www.webofstories.com/play/hans.bethe/71',
- 'md5': '373e4dd915f60cfe3116322642ddf364',
- 'info_dict': {
- 'id': '4536',
- 'ext': 'mp4',
- 'title': 'The temperature of the sun',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Hans Bethe talks about calculating the temperature of the sun',
- 'duration': 238,
- }
+ _TESTS = [{
+ 'url': 'http://www.webofstories.com/play/hans.bethe/71',
+ 'md5': '373e4dd915f60cfe3116322642ddf364',
+ 'info_dict': {
+ 'id': '4536',
+ 'ext': 'mp4',
+ 'title': 'The temperature of the sun',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Hans Bethe talks about calculating the temperature of the sun',
+ 'duration': 238,
+ }
+ }, {
+ 'url': 'http://www.webofstories.com/play/55908',
+ 'md5': '2985a698e1fe3211022422c4b5ed962c',
+ 'info_dict': {
+ 'id': '55908',
+ 'ext': 'mp4',
+ 'title': 'The story of Gemmata obscuriglobus',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+ 'duration': 169,
+ },
+ 'skip': 'notfound',
+ }, {
+ # malformed og:title meta
+ 'url': 'http://www.webofstories.com/play/54215?o=MS',
+ 'info_dict': {
+ 'id': '54215',
+ 'ext': 'mp4',
+ 'title': '"A Leg to Stand On"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+ 'duration': 97,
},
- {
- 'url': 'http://www.webofstories.com/play/55908',
- 'md5': '2985a698e1fe3211022422c4b5ed962c',
- 'info_dict': {
- 'id': '55908',
- 'ext': 'mp4',
- 'title': 'The story of Gemmata obscuriglobus',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
- 'duration': 169,
- }
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
+ # Sometimes og:title meta is malformed
+ title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+ r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 041ff6c55..fb0accac7 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -20,7 +20,7 @@ class WimpIE(InfoExtractor):
'md5': '4e2986c793694b55b37cf92521d12bb4',
'info_dict': {
'id': 'clowncar',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'It\'s like a clown car.',
'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
},
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index fdb16d91c..41061dd31 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -35,7 +35,8 @@ class WistiaIE(InfoExtractor):
formats = []
thumbnails = []
- for atype, a in data['assets'].items():
+ for a in data['assets']:
+ atype = a.get('type')
if atype == 'still':
thumbnails.append({
'url': a['url'],
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index a3ea26feb..09415b589 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -8,12 +8,12 @@ from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
_TESTS = [{
- "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
- "md5": "9d04de741161603bf7071bbf4e883186",
- "info_dict": {
- "id": "wshh6a7q1ny0G34ZwuIO",
- "ext": "mp4",
- "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+ 'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO',
+ 'md5': '9d04de741161603bf7071bbf4e883186',
+ 'info_dict': {
+ 'id': 'wshh6a7q1ny0G34ZwuIO',
+ 'ext': 'mp4',
+ 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!'
}
}, {
'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
@@ -21,7 +21,7 @@ class WorldStarHipHopIE(InfoExtractor):
'info_dict': {
'id': 'wshh6a7q1ny0G34ZwuIO',
'ext': 'mp4',
- "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+ 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!'
}
}]
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index a3236e66c..94abdb4f3 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -17,7 +17,7 @@ class XFileShareIE(InfoExtractor):
IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
@@ -81,6 +81,13 @@ class XFileShareIE(InfoExtractor):
'ext': 'mp4',
'title': 'test'
}
+ }, {
+ 'url': 'http://powerwatch.pw/duecjibvicbu',
+ 'info_dict': {
+ 'id': 'duecjibvicbu',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny trailer',
+ },
}]
def _real_extract(self, url):
@@ -112,6 +119,7 @@ class XFileShareIE(InfoExtractor):
title = (self._search_regex(
[r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
+ r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+) ',
r'<h2 class="video-page-head">([^<]+)</h2>'],
webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 8cd3a0687..4075b8a4f 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -7,15 +7,17 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
- parse_duration,
+ orderedSet,
sanitized_Request,
str_to_int,
)
class XTubeIE(InfoExtractor):
- _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/watch\.php\?.*\bv=)(?P<id>[^/?&#]+)'
- _TEST = {
+ _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
+
+ _TESTS = [{
+ # old URL schema
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
@@ -27,63 +29,60 @@ class XTubeIE(InfoExtractor):
'duration': 450,
'age_limit': 18,
}
- }
+ }, {
+ # new URL schema
+ 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
+ 'only_matching': True,
+ }, {
+ 'url': 'xtube:625837',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- req = sanitized_Request('http://www.xtube.com/watch.php?v=%s' % video_id)
- req.add_header('Cookie', 'age_verified=1')
- webpage = self._download_webpage(req, video_id)
-
- video_title = self._html_search_regex(
- r'<p class="title">([^<]+)', webpage, 'title')
- video_uploader = self._html_search_regex(
- [r"var\s+contentOwnerId\s*=\s*'([^']+)",
- r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'],
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ if not display_id:
+ display_id = video_id
+ url = 'http://www.xtube.com/watch.php?v=%s' % video_id
+
+ req = sanitized_Request(url)
+ req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
+ webpage = self._download_webpage(req, display_id)
+
+ flashvars = self._parse_json(
+ self._search_regex(
+ r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
+ video_id)['flashvars']
+
+ title = flashvars.get('title') or self._search_regex(
+ r'<h1>([^<]+)</h1>', webpage, 'title')
+ video_url = compat_urllib_parse_unquote(flashvars['video_url'])
+ duration = int_or_none(flashvars.get('video_duration'))
+
+ uploader = self._search_regex(
+ r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
webpage, 'uploader', fatal=False)
- video_description = self._html_search_regex(
- r'<p class="fieldsDesc">([^<]+)',
- webpage, 'description', fatal=False)
- duration = parse_duration(self._html_search_regex(
- r'<span class="bold">Runtime:</span> ([^<]+)</p>',
- webpage, 'duration', fatal=False))
- view_count = str_to_int(self._html_search_regex(
- r'<span class="bold">Views:</span> ([\d,\.]+)</p>',
+ description = self._search_regex(
+ r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+ view_count = str_to_int(self._search_regex(
+ r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
- r'<div id="commentBar">([\d,\.]+) Comments</div>',
+ r'>Comments? \(([\d,\.]+)\)<',
webpage, 'comment count', fatal=False))
- formats = []
- for format_id, video_url in re.findall(
- r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
- fmt = {
- 'url': compat_urllib_parse_unquote(video_url),
- 'format_id': format_id,
- }
- m = re.search(r'^(?P<height>\d+)[pP]', format_id)
- if m:
- fmt['height'] = int(m.group('height'))
- formats.append(fmt)
-
- if not formats:
- video_url = compat_urllib_parse_unquote(self._search_regex(
- r'flashvars\.video_url\s*=\s*"([^"]+)"',
- webpage, 'video URL'))
- formats.append({'url': video_url})
-
- self._sort_formats(formats)
-
return {
'id': video_id,
- 'title': video_title,
- 'uploader': video_uploader,
- 'description': video_description,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
- 'formats': formats,
'age_limit': 18,
}
@@ -120,7 +119,8 @@ class XTubeUserIE(InfoExtractor):
if not html:
break
- for _, video_id in re.findall(r'data-plid=(["\'])(.+?)\1', html):
+ for video_id in orderedSet([video_id for _, video_id in re.findall(
+ r'data-plid=(["\'])(.+?)\1', html)]):
entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key()))
page_count = int_or_none(page.get('pageCount'))
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index d3cc1a29f..e699e663f 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -10,13 +10,27 @@ from ..compat import (
compat_urllib_parse,
)
from ..utils import (
+ ExtractorError,
int_or_none,
float_or_none,
sanitized_Request,
)
-class YandexMusicTrackIE(InfoExtractor):
+class YandexMusicBaseIE(InfoExtractor):
+ @staticmethod
+ def _handle_error(response):
+ error = response.get('error')
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ def _download_json(self, *args, **kwargs):
+ response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
+ self._handle_error(response)
+ return response
+
+
+class YandexMusicTrackIE(YandexMusicBaseIE):
IE_NAME = 'yandexmusic:track'
IE_DESC = 'Яндекс.Музыка - Трек'
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
@@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor):
return self._get_track_info(track)
-class YandexMusicPlaylistBaseIE(InfoExtractor):
+class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
def _build_playlist(self, tracks):
return [
self.url_result(
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index c642075dc..4150b28da 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -16,8 +16,8 @@ class YouJizzIE(InfoExtractor):
'info_dict': {
'id': '2189178',
'ext': 'flv',
- "title": "Zeichentrick 1",
- "age_limit": 18,
+ 'title': 'Zeichentrick 1',
+ 'age_limit': 18,
}
}
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 49687371a..900eb2aba 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -214,10 +214,10 @@ class YoukuIE(InfoExtractor):
return raw_data['data']
- video_password = self._downloader.params.get('videopassword', None)
+ video_password = self._downloader.params.get('videopassword')
# request basic data
- basic_data_url = "http://play.youku.com/play/get.json?vid=%s&ct=12" % video_id
+ basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id
if video_password:
basic_data_url += '&pwd=%s' % video_password
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index b29baafc4..1124fe6c2 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):
links = []
sources = self._search_regex(
- r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+ r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
if sources:
for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
links.append(link)
@@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):
}
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# We will benefit from it by extracting some metadata
- mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
if mobj:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b9a91dea2..27e67feb4 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
import itertools
import json
import os.path
+import random
import re
import time
import traceback
@@ -375,14 +376,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube'
_TESTS = [
{
- 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
'info_dict': {
'id': 'BaW_jenozKc',
'ext': 'mp4',
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -401,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
'alt_title': 'I Love It (feat. Charli XCX)',
- 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+ 'license': 'Standard YouTube License',
'creator': 'Icona Pop',
}
},
@@ -422,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+ 'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
'age_limit': 18,
}
@@ -437,11 +444,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
'uploader': 'SET India',
'uploader_id': 'setindia',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
}
},
{
- 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
'note': 'Use the first video ID in the URL',
'info_dict': {
'id': 'BaW_jenozKc',
@@ -449,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -468,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'm4a',
'upload_date': '20121002',
'uploader_id': '8KVIDEO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
+ 'license': 'Standard YouTube License',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
@@ -488,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
+ 'license': 'Standard YouTube License',
},
'params': {
'youtube_include_dash_manifest': True,
@@ -506,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
+ 'license': 'Standard YouTube License',
'creator': 'Taylor Swift',
},
'params': {
@@ -522,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20100909',
'uploader': 'The Amazing Atheist',
'uploader_id': 'TheAmazingAtheist',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+ 'license': 'Standard YouTube License',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
@@ -536,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -550,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
'upload_date': '20110629',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -562,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20100430',
'uploader_id': 'deadmau5',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',
'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
+ 'license': 'Standard YouTube License',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
},
@@ -580,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20150827',
'uploader_id': 'olympic',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
+ 'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
@@ -597,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'stretched_ratio': 16 / 9.,
'upload_date': '20110310',
'uploader_id': 'AllenMeow',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫艾倫',
+ 'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
},
@@ -629,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:116377fd2963b81ec4ce64b542173306',
'upload_date': '20150625',
'uploader_id': 'dorappi2000',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
+ 'license': 'Standard YouTube License',
'formats': 'mincount:33',
},
},
@@ -644,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'license': 'Standard YouTube License',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
@@ -668,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -678,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -688,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -698,6 +734,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}],
'params': {
@@ -705,6 +743,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
{
+ # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
+ 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
+ 'info_dict': {
+ 'id': 'gVfLd0zydlo',
+ 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
+ },
+ 'playlist_count': 2,
+ },
+ {
'url': 'http://vid.plus/FlRa-iH7PGw',
'only_matching': True,
},
@@ -722,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'upload_date': '20151119',
'uploader_id': 'IronSoulElf',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
+ 'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
},
'params': {
@@ -751,6 +800,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
{
+ # Video licensed under Creative Commons
+ 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+ 'info_dict': {
+ 'id': 'M4gD1WSo5mA',
+ 'ext': 'mp4',
+ 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+ 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+ 'upload_date': '20150127',
+ 'uploader_id': 'BerkmanCenter',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+ 'uploader': 'BerkmanCenter',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Channel-like uploader_url
+ 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+ 'info_dict': {
+ 'id': 'eQcmzGIKrzg',
+ 'ext': 'mp4',
+ 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+ 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+ 'upload_date': '20151119',
+ 'uploader': 'Bernie 2016',
+ 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
'only_matching': True,
}
@@ -966,40 +1051,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return {}
try:
args = player_config['args']
- caption_url = args['ttsurl']
- if not caption_url:
- self._downloader.report_warning(err_msg)
- return {}
- timestamp = args['timestamp']
- # We get the available subtitles
- list_params = compat_urllib_parse.urlencode({
- 'type': 'list',
- 'tlangs': 1,
- 'asrs': 1,
- })
- list_url = caption_url + '&' + list_params
- caption_list = self._download_xml(list_url, video_id)
- original_lang_node = caption_list.find('track')
- if original_lang_node is None:
- self._downloader.report_warning('Video doesn\'t have automatic captions')
- return {}
- original_lang = original_lang_node.attrib['lang_code']
- caption_kind = original_lang_node.attrib.get('kind', '')
+ caption_url = args.get('ttsurl')
+ if caption_url:
+ timestamp = args['timestamp']
+ # We get the available subtitles
+ list_params = compat_urllib_parse.urlencode({
+ 'type': 'list',
+ 'tlangs': 1,
+ 'asrs': 1,
+ })
+ list_url = caption_url + '&' + list_params
+ caption_list = self._download_xml(list_url, video_id)
+ original_lang_node = caption_list.find('track')
+ if original_lang_node is None:
+ self._downloader.report_warning('Video doesn\'t have automatic captions')
+ return {}
+ original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
+
+ sub_lang_list = {}
+ for lang_node in caption_list.findall('target'):
+ sub_lang = lang_node.attrib['lang_code']
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ params = compat_urllib_parse.urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
+ return sub_lang_list
+
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ parsed_caption_url = compat_urlparse.urlparse(caption_url)
+ caption_qs = compat_parse_qs(parsed_caption_url.query)
sub_lang_list = {}
- for lang_node in caption_list.findall('target'):
- sub_lang = lang_node.attrib['lang_code']
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if not sub_lang:
+ continue
sub_formats = []
for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': ext,
- 'ts': timestamp,
- 'kind': caption_kind,
+ caption_qs.update({
+ 'tlang': [sub_lang],
+ 'fmt': [ext],
})
+ sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
+ query=compat_urllib_parse.urlencode(caption_qs, True)))
sub_formats.append({
- 'url': caption_url + '&' + params,
+ 'url': sub_url,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
@@ -1010,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning(err_msg)
return {}
+ def _mark_watched(self, video_id, video_info):
+ playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+ if not playback_url:
+ return
+ parsed_playback_url = compat_urlparse.urlparse(playback_url)
+ qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+ # cpn generation algorithm is reverse engineered from base.js.
+ # In fact it works even with dummy cpn.
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+ cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+ qs.update({
+ 'ver': ['2'],
+ 'cpn': [cpn],
+ })
+ playback_url = compat_urlparse.urlunparse(
+ parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+ self._download_webpage(
+ playback_url, video_id, 'Marking watched',
+ 'Unable to mark watched', fatal=False)
+
@classmethod
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
@@ -1196,9 +1331,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not self._downloader.params.get('noplaylist'):
entries = []
feed_ids = []
- multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+ multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
for feed in multifeed_metadata_list.split(','):
- feed_data = compat_parse_qs(feed)
+ # Unquote should take place before split on comma (,) since textual
+ # fields may contain comma as well (see
+ # https://github.com/rg3/youtube-dl/issues/8536)
+ feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
entries.append({
'_type': 'url_transparent',
'ie_key': 'Youtube',
@@ -1233,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+ video_uploader_url = None
+ mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+ video_webpage)
if mobj is not None:
- video_uploader_id = mobj.group(1)
+ video_uploader_id = mobj.group('uploader_id')
+ video_uploader_url = mobj.group('uploader_url')
else:
self._downloader.report_warning('unable to extract uploader nickname')
@@ -1263,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
+ video_license = self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+ video_webpage, 'license', default=None)
+
m_music = re.search(
r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
video_webpage)
@@ -1336,6 +1482,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
+ formats_spec = {}
+ fmt_list = video_info.get('fmt_list', [''])[0]
+ if fmt_list:
+ for fmt in fmt_list.split(','):
+ spec = fmt.split('/')
+ if len(spec) > 1:
+ width_height = spec[1].split('x')
+ if len(width_height) == 2:
+ formats_spec[spec[0]] = {
+ 'resolution': spec[1],
+ 'width': int_or_none(width_height[0]),
+ 'height': int_or_none(width_height[1]),
+ }
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
@@ -1404,6 +1563,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
if format_id in self._formats:
dct.update(self._formats[format_id])
+ if format_id in formats_spec:
+ dct.update(formats_spec[format_id])
# Some itags are not included in DASH manifest thus corresponding formats will
# lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
@@ -1516,11 +1677,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._sort_formats(formats)
+ self.mark_watched(video_id, video_info)
+
return {
'id': video_id,
'uploader': video_uploader,
'uploader_id': video_uploader_id,
+ 'uploader_url': video_uploader_url,
'upload_date': upload_date,
+ 'license': video_license,
'creator': video_creator,
'title': video_title,
'alt_title': video_alt_title,
@@ -1689,13 +1854,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
- def _real_extract(self, url):
- # Extract playlist id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
- playlist_id = mobj.group(1) or mobj.group(2)
-
+ def _check_download_just_video(self, url, playlist_id):
# Check if it's a video-specific URL
query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
if 'v' in query_dict:
@@ -1706,6 +1865,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
else:
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ def _real_extract(self, url):
+ # Extract playlist id
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError('Invalid URL: %s' % url)
+ playlist_id = mobj.group(1) or mobj.group(2)
+
+ video = self._check_download_just_video(url, playlist_id)
+ if video:
+ return video
+
if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
@@ -1900,13 +2070,16 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
class YoutubeSearchURLIE(InfoExtractor):
IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
'info_dict': {
'title': 'youtube-dl test video',
}
+ }, {
+ 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -2011,11 +2184,20 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
class YoutubeWatchLaterIE(YoutubePlaylistIE):
IE_NAME = 'youtube:watchlater'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+ _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
- _TESTS = [] # override PlaylistIE tests
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/playlist?list=WL',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
+ video = self._check_download_just_video(url, 'WL')
+ if video:
+ return video
return self._extract_playlist('WL')
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index c619a75e2..81c22a627 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -137,6 +137,10 @@ class ZDFIE(InfoExtractor):
formats.extend(self._extract_smil_formats(
video_url, video_id, fatal=False))
elif ext == 'm3u8':
+ # the certificates are misconfigured (see
+ # https://github.com/rg3/youtube-dl/issues/8665)
+ if video_url.startswith('https://'):
+ continue
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':