aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py34
-rw-r--r--youtube_dl/aes.py2
-rw-r--r--youtube_dl/extractor/__init__.py35
-rw-r--r--youtube_dl/extractor/aftonbladet.py11
-rw-r--r--youtube_dl/extractor/arte.py5
-rw-r--r--youtube_dl/extractor/bilibili.py20
-rw-r--r--youtube_dl/extractor/cbsnews.py2
-rw-r--r--youtube_dl/extractor/ccc.py4
-rw-r--r--youtube_dl/extractor/chilloutzone.py2
-rw-r--r--youtube_dl/extractor/cinemassacre.py13
-rw-r--r--youtube_dl/extractor/cnn.py2
-rw-r--r--youtube_dl/extractor/common.py4
-rw-r--r--youtube_dl/extractor/crunchyroll.py30
-rw-r--r--youtube_dl/extractor/dailymotion.py5
-rw-r--r--youtube_dl/extractor/drtv.py37
-rw-r--r--youtube_dl/extractor/empflix.py32
-rw-r--r--youtube_dl/extractor/espn.py55
-rw-r--r--youtube_dl/extractor/facebook.py11
-rw-r--r--youtube_dl/extractor/firedrive.py80
-rw-r--r--youtube_dl/extractor/gamespot.py62
-rw-r--r--youtube_dl/extractor/generic.py231
-rw-r--r--youtube_dl/extractor/imgur.py6
-rw-r--r--youtube_dl/extractor/instagram.py8
-rw-r--r--youtube_dl/extractor/iprima.py16
-rw-r--r--youtube_dl/extractor/izlesene.py18
-rw-r--r--youtube_dl/extractor/karrierevideos.py96
-rw-r--r--youtube_dl/extractor/letv.py4
-rw-r--r--youtube_dl/extractor/mitele.py9
-rw-r--r--youtube_dl/extractor/naver.py24
-rw-r--r--youtube_dl/extractor/nba.py20
-rw-r--r--youtube_dl/extractor/nextmedia.py40
-rw-r--r--youtube_dl/extractor/nova.py179
-rw-r--r--youtube_dl/extractor/nowtv.py192
-rw-r--r--youtube_dl/extractor/odnoklassniki.py36
-rw-r--r--youtube_dl/extractor/ooyala.py155
-rw-r--r--youtube_dl/extractor/patreon.py2
-rw-r--r--youtube_dl/extractor/porn91.py71
-rw-r--r--youtube_dl/extractor/pornhub.py3
-rw-r--r--youtube_dl/extractor/pornovoisines.py4
-rw-r--r--youtube_dl/extractor/prosiebensat1.py2
-rw-r--r--youtube_dl/extractor/qqmusic.py89
-rw-r--r--youtube_dl/extractor/rtbf.py30
-rw-r--r--youtube_dl/extractor/rtlnow.py174
-rw-r--r--youtube_dl/extractor/rts.py1
-rw-r--r--youtube_dl/extractor/rtve.py2
-rw-r--r--youtube_dl/extractor/rutv.py2
-rw-r--r--youtube_dl/extractor/sbs.py18
-rw-r--r--youtube_dl/extractor/senateisvp.py8
-rw-r--r--youtube_dl/extractor/shared.py2
-rw-r--r--youtube_dl/extractor/sockshare.py83
-rw-r--r--youtube_dl/extractor/sohu.py4
-rw-r--r--youtube_dl/extractor/soompi.py146
-rw-r--r--youtube_dl/extractor/spankwire.py2
-rw-r--r--youtube_dl/extractor/spiegeltv.py9
-rw-r--r--youtube_dl/extractor/sportbox.py131
-rw-r--r--youtube_dl/extractor/teamcoco.py32
-rw-r--r--youtube_dl/extractor/telecinco.py4
-rw-r--r--youtube_dl/extractor/tenplay.py27
-rw-r--r--youtube_dl/extractor/tf1.py12
-rw-r--r--youtube_dl/extractor/tnaflix.py44
-rw-r--r--youtube_dl/extractor/tube8.py14
-rw-r--r--youtube_dl/extractor/tubitv.py84
-rw-r--r--youtube_dl/extractor/tumblr.py17
-rw-r--r--youtube_dl/extractor/tutv.py2
-rw-r--r--youtube_dl/extractor/tv2.py126
-rw-r--r--youtube_dl/extractor/tvigle.py39
-rw-r--r--youtube_dl/extractor/twentyfourvideo.py4
-rw-r--r--youtube_dl/extractor/ultimedia.py10
-rw-r--r--youtube_dl/extractor/vgtv.py28
-rw-r--r--youtube_dl/extractor/videott.py2
-rw-r--r--youtube_dl/extractor/vidme.py9
-rw-r--r--youtube_dl/extractor/vier.py9
-rw-r--r--youtube_dl/extractor/viki.py333
-rw-r--r--youtube_dl/extractor/vuclip.py2
-rw-r--r--youtube_dl/extractor/vulture.py2
-rw-r--r--youtube_dl/extractor/wimp.py3
-rw-r--r--youtube_dl/extractor/xminus.py4
-rw-r--r--youtube_dl/extractor/yahoo.py18
-rw-r--r--youtube_dl/extractor/youtube.py182
-rw-r--r--youtube_dl/options.py4
-rw-r--r--youtube_dl/postprocessor/embedthumbnail.py4
-rw-r--r--youtube_dl/utils.py10
-rw-r--r--youtube_dl/version.py2
83 files changed, 2227 insertions, 1063 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 691f3e09f..aa6ec9d9a 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -49,6 +49,7 @@ from .utils import (
ExtractorError,
format_bytes,
formatSeconds,
+ HEADRequest,
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
@@ -759,7 +760,9 @@ class YoutubeDL(object):
if isinstance(ie_entries, list):
n_all_entries = len(ie_entries)
if playlistitems:
- entries = [ie_entries[i - 1] for i in playlistitems]
+ entries = [
+ ie_entries[i - 1] for i in playlistitems
+ if -n_all_entries <= i - 1 < n_all_entries]
else:
entries = ie_entries[playliststart:playlistend]
n_entries = len(entries)
@@ -921,8 +924,9 @@ class YoutubeDL(object):
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
if audiovideo_formats:
return audiovideo_formats[format_idx]
- # for audio only urls, select the best/worst audio format
- elif all(f.get('acodec') != 'none' for f in available_formats):
+ # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
+ elif (all(f.get('acodec') != 'none' for f in available_formats) or
+ all(f.get('vcodec') != 'none' for f in available_formats)):
return available_formats[format_idx]
elif format_spec == 'bestaudio':
audio_formats = [
@@ -1045,6 +1049,8 @@ class YoutubeDL(object):
if not formats:
raise ExtractorError('No video formats found!')
+ formats_dict = {}
+
# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
if 'url' not in format:
@@ -1052,6 +1058,18 @@ class YoutubeDL(object):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
+ format_id = format['format_id']
+ if format_id not in formats_dict:
+ formats_dict[format_id] = []
+ formats_dict[format_id].append(format)
+
+ # Make sure all formats have unique format_id
+ for format_id, ambiguous_formats in formats_dict.items():
+ if len(ambiguous_formats) > 1:
+ for i, format in enumerate(ambiguous_formats):
+ format['format_id'] = '%s-%d' % (format_id, i)
+
+ for i, format in enumerate(formats):
if format.get('format') is None:
format['format'] = '{id} - {res}{note}'.format(
id=format['format_id'],
@@ -1366,7 +1384,7 @@ class YoutubeDL(object):
postprocessors = []
self.report_warning('You have requested multiple '
'formats but ffmpeg or avconv are not installed.'
- ' The formats won\'t be merged')
+ ' The formats won\'t be merged.')
else:
postprocessors = [merger]
@@ -1393,8 +1411,8 @@ class YoutubeDL(object):
requested_formats = info_dict['requested_formats']
if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
info_dict['ext'] = 'mkv'
- self.report_warning('You have requested formats incompatible for merge. '
- 'The formats will be merged into mkv')
+ self.report_warning(
+ 'Requested formats are incompatible for merge and will be merged into mkv.')
# Ensure filename always has a correct extension for successful merge
filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
if os.path.exists(encodeFilename(filename)):
@@ -1525,6 +1543,7 @@ class YoutubeDL(object):
pps_chain.extend(ie_info['__postprocessors'])
pps_chain.extend(self._pps)
for pp in pps_chain:
+ files_to_delete = []
try:
files_to_delete, info = pp.run(info)
except PostProcessingError as e:
@@ -1703,7 +1722,8 @@ class YoutubeDL(object):
if req_is_string:
req = url_escaped
else:
- req = compat_urllib_request.Request(
+ req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+ req = req_type(
url_escaped, data=req.data, headers=req.headers,
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py
index 07224d508..7817adcfd 100644
--- a/youtube_dl/aes.py
+++ b/youtube_dl/aes.py
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
"""
NONCE_LENGTH_BYTES = 8
- data = bytes_to_intlist(base64.b64decode(data))
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
password = bytes_to_intlist(password.encode('utf-8'))
key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 860023d14..631381eea 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -141,6 +141,7 @@ from .engadget import EngadgetIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .escapist import EscapistIE
+from .espn import ESPNIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
@@ -148,7 +149,6 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
-from .firedrive import FiredriveIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
@@ -243,6 +243,7 @@ from .kaltura import KalturaIE
from .kanalplay import KanalPlayIE
from .kankan import KankanIE
from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
@@ -337,8 +338,7 @@ from .newstube import NewstubeIE
from .nextmedia import (
NextMediaIE,
NextMediaActionNewsIE,
- AppleDailyRealtimeNewsIE,
- AppleDailyAnimationNewsIE
+ AppleDailyIE,
)
from .nfb import NFBIE
from .nfl import NFLIE
@@ -352,8 +352,10 @@ from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
+from .nova import NovaIE
from .novamov import NovaMovIE
from .nowness import NownessIE
+from .nowtv import NowTVIE
from .nowvideo import NowVideoIE
from .npo import (
NPOIE,
@@ -376,7 +378,10 @@ from .nytimes import (
from .nuvid import NuvidIE
from .odnoklassniki import OdnoklassnikiIE
from .oktoberfesttv import OktoberfestTVIE
-from .ooyala import OoyalaIE
+from .ooyala import (
+ OoyalaIE,
+ OoyalaExternalIE,
+)
from .openfilm import OpenFilmIE
from .orf import (
ORFTVthekIE,
@@ -397,6 +402,7 @@ from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE
from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
@@ -434,7 +440,6 @@ from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rte import RteIE
from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
from .rts import RTSIE
@@ -477,8 +482,11 @@ from .smotri import (
SmotriBroadcastIE,
)
from .snotr import SnotrIE
-from .sockshare import SockshareIE
from .sohu import SohuIE
+from .soompi import (
+ SoompiIE,
+ SoompiShowIE,
+)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
@@ -503,7 +511,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
+ SportBoxIE,
+ SportBoxEmbedIE,
+)
from .sportdeutschland import SportDeutschlandIE
from .srf import SrfIE
from .srmediathek import SRMediathekIE
@@ -561,11 +572,16 @@ from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
+from .tv2 import (
+ TV2IE,
+ TV2ArticleIE,
+)
from .tv4 import TV4IE
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
@@ -637,7 +653,10 @@ from .vine import (
VineIE,
VineUserIE,
)
-from .viki import VikiIE
+from .viki import (
+ VikiIE,
+ VikiChannelIE,
+)
from .vk import (
VKIE,
VKUserVideosIE,
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index a117502bc..e0518cf26 100644
--- a/youtube_dl/extractor/aftonbladet.py
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -6,11 +6,11 @@ from ..utils import int_or_none
class AftonbladetIE(InfoExtractor):
- _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+ _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+ 'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'info_dict': {
- 'id': 'article36015',
+ 'id': '36015',
'ext': 'mp4',
'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
# find internal video meta data
meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
- internal_meta_id = self._html_search_regex(
- r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+ player_config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+ internal_meta_id = player_config['videoId']
internal_meta_url = meta_url % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 8273bd6c9..76de24477 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -7,7 +7,6 @@ from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
unified_strdate,
- get_element_by_id,
get_element_by_attribute,
int_or_none,
qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
def _real_extract(self, url):
anchor_id, lang = self._extract_url_info(url)
webpage = self._download_webpage(url, anchor_id)
- row = get_element_by_id(anchor_id, webpage)
+ row = self._search_regex(
+ r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+ webpage, 'row')
return self._extract_from_webpage(row, anchor_id, lang)
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 7ca835e31..2103ed73a 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import re
import itertools
+import json
+import xml.etree.ElementTree as ET
from .common import InfoExtractor
from ..utils import (
@@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor):
entries = []
- lq_doc = self._download_xml(
+ lq_page = self._download_webpage(
'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
video_id,
note='Downloading LQ video info'
)
+ try:
+ err_info = json.loads(lq_page)
+ raise ExtractorError(
+ 'BiliBili said: ' + err_info['error_text'], expected=True)
+ except ValueError:
+ pass
+
+ lq_doc = ET.fromstring(lq_page)
lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml(
@@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor):
note='Downloading HQ video info',
fatal=False,
)
- hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None)
-
- assert len(lq_durls) == len(hq_durls)
+ if hq_doc is not False:
+ hq_durls = hq_doc.findall('./durl')
+ assert len(lq_durls) == len(hq_durls)
+ else:
+ hq_durls = itertools.repeat(None)
i = 1
for lq_durl, hq_durl in zip(lq_durls, hq_durls):
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 7e47960ab..52e61d85b 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):
'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
'ext': 'flv',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
- 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
},
'params': {
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index 2a5d4be18..6924eac70 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
_TEST = {
'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
- 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+ 'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
'id': '20131228183',
'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
matches = re.finditer(r'''(?xs)
<(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
- <a\s+href='(?P<http_url>[^']+)'>\s*
+ <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
(?:
.*?
<a\s+href='(?P<torrent_url>[^']+\.torrent)'
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index c922f6959..0206d96db 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
- decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+ decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index cf0a7551b..c949a4814 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor):
'uploader_id': 'Cinemassacre',
'title': 'AVGN: McKids',
}
+ },
+ {
+ 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+ 'md5': '1376908e49572389e7b06251a53cdd08',
+ 'info_dict': {
+ 'id': 'Cinemassacre-555779690c440',
+ 'ext': 'mp4',
+ 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+ 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+ 'upload_date': '20150525',
+ }
}
]
@@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor):
playerdata_url = self._search_regex(
[
- r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+ r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
],
webpage, 'player data URL', default=None)
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 5efc5f4fe..3b1bd4033 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -12,7 +12,7 @@ from ..utils import (
class CNNIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
- (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 65bb77086..cecf917ff 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -786,8 +786,8 @@ class InfoExtractor(object):
return True
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
- self.report_warning(
- '%s URL is invalid, skipping' % item, video_id)
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
return False
raise
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47e..41f0c736d 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):
self._login()
def _decrypt_subtitles(self, data, iv, id):
- data = bytes_to_intlist(data)
- iv = bytes_to_intlist(iv)
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+ iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
id = int(id)
def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
+ def _extract_subtitles(self, subtitle):
+ sub_root = xml.etree.ElementTree.fromstring(subtitle)
+ return [{
+ 'ext': 'srt',
+ 'data': self._convert_subtitles_to_srt(sub_root),
+ }, {
+ 'ext': 'ass',
+ 'data': self._convert_subtitles_to_ass(sub_root),
+ }]
+
def _get_subtitles(self, video_id, webpage):
subtitles = {}
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
if not id or not iv or not data:
continue
- id = int(id)
- iv = base64.b64decode(iv)
- data = base64.b64decode(data)
-
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
- sub_root = xml.etree.ElementTree.fromstring(subtitle)
- subtitles[lang_code] = [
- {
- 'ext': 'srt',
- 'data': self._convert_subtitles_to_srt(sub_root),
- },
- {
- 'ext': 'ass',
- 'data': self._convert_subtitles_to_ass(sub_root),
- },
- ]
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
return subtitles
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index db10b8d00..70aa4333c 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
@@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
- webpage = self._download_webpage(url, user)
+ webpage = self._download_webpage(
+ 'https://www.dailymotion.com/user/%s' % user, user)
full_user = unescapeHTML(self._html_search_regex(
r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
webpage, 'user'))
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index f25ab319e..baa24c6d1 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -1,8 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+)
class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
restricted_to_denmark = asset['RestrictedToDenmark']
spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
for link in asset['Links']:
- target = link['Target']
uri = link['Uri']
+ target = link['Target']
format_id = target
- preference = -1 if target == 'HDS' else -2
+ preference = None
if spoken_subtitles:
- preference -= 2
+ preference = -1
format_id += '-spoken-subtitles'
- formats.append({
- 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
- 'format_id': format_id,
- 'ext': link['FileFormat'],
- 'preference': preference,
- })
+ if target == 'HDS':
+ formats.extend(self._extract_f4m_formats(
+ uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+ video_id, preference, f4m_id=format_id))
+ elif target == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ uri, video_id, 'mp4', preference=preference,
+ m3u8_id=format_id))
+ else:
+ bitrate = link.get('Bitrate')
+ if bitrate:
+ format_id += '-%s' % bitrate
+ formats.append({
+ 'url': uri,
+ 'format_id': format_id,
+ 'tbr': bitrate,
+ 'ext': link.get('FileFormat'),
+ })
subtitles_list = asset.get('SubtitlesList')
if isinstance(subtitles_list, list):
LANGS = {
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
index 70f8efe27..4827022e0 100644
--- a/youtube_dl/extractor/empflix.py
+++ b/youtube_dl/extractor/empflix.py
@@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE
class EMPFlixIE(TNAFlixIE):
- _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
_TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
_DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
_CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
- _TEST = {
- 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
- 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
- 'info_dict': {
- 'id': '33051',
- 'display_id': 'Amateur-Finger-Fuck',
- 'ext': 'mp4',
- 'title': 'Amateur Finger Fuck',
- 'description': 'Amateur solo finger fucking.',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'age_limit': 18,
+ _TESTS = [
+ {
+ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+ 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+ 'info_dict': {
+ 'id': '33051',
+ 'display_id': 'Amateur-Finger-Fuck',
+ 'ext': 'mp4',
+ 'title': 'Amateur Finger Fuck',
+ 'description': 'Amateur solo finger fucking.',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ },
+ {
+ 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+ 'only_matching': True,
}
- }
+ ]
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644
index 000000000..e6f8f0337
--- /dev/null
+++ b/youtube_dl/extractor/espn.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ESPNIE(InfoExtractor):
+ _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+ _WORKING = False
+ _TESTS = [{
+ 'url': 'http://espn.go.com/video/clip?id=10365079',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'class="video-play-button"[^>]+data-id="(\d+)',
+ webpage, 'video id')
+
+ player = self._download_webpage(
+ 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+
+ pcode = self._search_regex(
+ r'["\']pcode=([^"\']+)["\']', player, 'pcode')
+
+ return self.url_result(
+ 'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
+ 'OoyalaExternal')
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 937b28fcc..82dc27bc6 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor):
'id': '274175099429670',
'ext': 'mp4',
'title': 'Facebook video #274175099429670',
- }
+ },
+ 'expected_warnings': [
+ 'title'
+ ]
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
@@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor):
raise ExtractorError('Cannot find video formats')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
- fatal=False)
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+ default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
+ webpage, 'alternative title', fatal=False)
video_title = limit_length(video_title, 80)
if not video_title:
video_title = 'Facebook video #%s' % video_id
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644
index 3191116d9..000000000
--- a/youtube_dl/extractor/firedrive.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
- '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
- _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
- _TESTS = [{
- 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
- 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
- 'info_dict': {
- 'id': 'FEB892FA160EBD01',
- 'ext': 'flv',
- 'title': 'bbb_theora_486kbit.flv',
- 'thumbnail': 're:^http://.*\.jpg$',
- },
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://firedrive.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', webpage))
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.firedrive.com')
-
- webpage = self._download_webpage(req, video_id,
- 'Downloading video page')
-
- title = self._search_regex(r'class="external_title_left">(.+)</div>',
- webpage, 'title')
- thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
- 'thumbnail', fatal=False)
- if thumbnail is not None:
- thumbnail = 'http:' + thumbnail
-
- ext = self._search_regex(r'type:\s?\'([^\']+)\',',
- webpage, 'extension', fatal=False)
- video_url = self._search_regex(
- r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': ext,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 47373e215..2d33fa7f5 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -14,8 +14,8 @@ from ..utils import (
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
- _TEST = {
+ _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+ _TESTS = [{
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
'info_dict': {
@@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):
'ext': 'mp4',
'title': 'Arma 3 - Community Guide: SITREP I',
'description': 'Check out this video where some of the basics of Arma 3 is explained.',
- }
- }
+ },
+ }, {
+ 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+ 'info_dict': {
+ 'id': 'gs-2300-6424837',
+ 'ext': 'flv',
+ 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing',
+ 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+ },
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
@@ -32,25 +40,37 @@ class GameSpotIE(InfoExtractor):
data_video_json = self._search_regex(
r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
+ streams = data_video['videoStreams']
- # Transform the manifest url to a link to the mp4 files
- # they are used in mobile devices.
- f4m_url = data_video['videoStreams']['f4m_stream']
- f4m_path = compat_urlparse.urlparse(f4m_url).path
- QUALITIES_RE = r'((,\d+)+,?)'
- qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
- http_path = f4m_path[1:].split('/', 1)[1]
- http_template = re.sub(QUALITIES_RE, r'%s', http_path)
- http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin(
- 'http://video.gamespotcdn.com/', http_template)
formats = []
- for q in qualities:
- formats.append({
- 'url': http_template % q,
- 'ext': 'mp4',
- 'format_id': q,
- })
+ f4m_url = streams.get('f4m_stream')
+ if f4m_url is not None:
+ # Transform the manifest url to a link to the mp4 files
+ # they are used in mobile devices.
+ f4m_path = compat_urlparse.urlparse(f4m_url).path
+ QUALITIES_RE = r'((,\d+)+,?)'
+ qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
+ http_path = f4m_path[1:].split('/', 1)[1]
+ http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+ http_template = http_template.replace('.csmil/manifest.f4m', '')
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
+ for q in qualities:
+ formats.append({
+ 'url': http_template % q,
+ 'ext': 'mp4',
+ 'format_id': q,
+ })
+ else:
+ for quality in ['sd', 'hd']:
+ # It's actually a link to a flv file
+ flv_url = streams.get('f4m_{0}'.format(quality))
+ if flv_url is not None:
+ formats.append({
+ 'url': flv_url,
+ 'ext': 'flv',
+ 'format_id': quality,
+ })
return {
'id': data_video['guid'],
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 3d756e848..96ca398de 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,6 +9,8 @@ from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
)
@@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE
from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
+from .sportbox import SportBoxEmbedIE
from .smotri import SmotriIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
@@ -45,6 +48,97 @@ class GenericIE(InfoExtractor):
_VALID_URL = r'.*'
IE_NAME = 'generic'
_TESTS = [
+ # Direct link to a video
+ {
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'ext': 'mp4',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
+ }
+ },
+ # Direct link to media delivered compressed (until Accept-Encoding is *)
+ {
+ 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+ 'md5': '128c42e68b13950268b648275386fc74',
+ 'info_dict': {
+ 'id': 'FictionJunction-Parallel_Hearts',
+ 'ext': 'flac',
+ 'title': 'FictionJunction-Parallel_Hearts',
+ 'upload_date': '20140522',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # RSS feed
+ {
+ 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'info_dict': {
+ 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'title': 'Zero Punctuation',
+ 'description': 're:.*groundbreaking video review series.*'
+ },
+ 'playlist_mincount': 11,
+ },
+ # RSS feed with enclosure
+ {
+ 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'info_dict': {
+ 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ 'ext': 'm4v',
+ 'upload_date': '20150228',
+ 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ }
+ },
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
{
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -124,17 +218,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, # m3u8 download
},
},
- # Direct link to a video
- {
- 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
- 'info_dict': {
- 'id': 'trailer',
- 'ext': 'mp4',
- 'title': 'trailer',
- 'upload_date': '20100513',
- }
- },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -159,22 +242,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
- # google redirect
- {
- 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
- 'info_dict': {
- 'id': 'cmQHVoWB5FY',
- 'ext': 'mp4',
- 'upload_date': '20130224',
- 'uploader_id': 'TheVerge',
- 'description': 're:^Chris Ziegler takes a look at the\.*',
- 'uploader': 'The Verge',
- 'title': 'First Firefox OS phones side-by-side',
- },
- 'params': {
- 'skip_download': False,
- }
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -224,6 +291,37 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # SportBox embed
+ {
+ 'url': 'http://www.vestifinance.ru/articles/25753',
+ 'info_dict': {
+ 'id': '25753',
+ 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '370908',
+ 'title': 'Госзаказ. День 3',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370905',
+ 'title': 'Госзаказ. День 2',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370902',
+ 'title': 'Госзаказ. День 1',
+ 'ext': 'mp4',
+ }
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -375,16 +473,6 @@ class GenericIE(InfoExtractor):
'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
- # RSS feed
- {
- 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'info_dict': {
- 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'title': 'Zero Punctuation',
- 'description': 're:.*groundbreaking video review series.*'
- },
- 'playlist_mincount': 11,
- },
# Multiple brightcove videos
# https://github.com/rg3/youtube-dl/issues/2283
{
@@ -438,21 +526,6 @@ class GenericIE(InfoExtractor):
'uploader': 'thoughtworks.wistia.com',
},
},
- # Direct download with broken HEAD
- {
- 'url': 'http://ai-radio.org:8000/radio.opus',
- 'info_dict': {
- 'id': 'radio',
- 'ext': 'opus',
- 'title': 'radio',
- },
- 'params': {
- 'skip_download': True, # infinite live stream
- },
- 'expected_warnings': [
- r'501.*Not Implemented'
- ],
- },
# Soundcloud embed
{
'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -484,21 +557,6 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 2,
},
- # Direct link with incorrect MIME type
- {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'md5': '4ccbebe5f36706d85221f204d7eb5913',
- 'info_dict': {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'id': '5_Lennart_Poettering_-_Systemd',
- 'ext': 'webm',
- 'title': '5_Lennart_Poettering_-_Systemd',
- 'upload_date': '20141120',
- },
- 'expected_warnings': [
- 'URL could be a direct video link, returning it as such.'
- ]
- },
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -657,16 +715,6 @@ class GenericIE(InfoExtractor):
'age_limit': 0,
},
},
- # RSS feed with enclosure
- {
- 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
- 'info_dict': {
- 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- 'ext': 'm4v',
- 'upload_date': '20150228',
- 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- }
- },
# Crooks and Liars embed
{
'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -862,7 +910,7 @@ class GenericIE(InfoExtractor):
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
else:
- video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+ video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
self.to_screen('%s: Requesting header' % video_id)
@@ -884,7 +932,9 @@ class GenericIE(InfoExtractor):
full_response = None
if head_response is False:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
head_response = full_response
# Check for direct link to a video
@@ -895,7 +945,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'formats': [{
'format_id': m.group('format_id'),
@@ -909,7 +959,17 @@ class GenericIE(InfoExtractor):
self._downloader.report_warning('Falling back on generic information extractor.')
if not full_response:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
+ # making it impossible to download only chunk of the file (yet we need only 512kB to
+ # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
+ # that will always result in downloading the whole file that is not desirable.
+ # Therefore for extraction pass we have to override Accept-Encoding to any in order
+ # to accept raw bytes and being able to download only a chunk.
+ # It may probably better to solve this by checking Content-Type for application/octet-stream
+ # after HEAD request finishes, but not sure if we can rely on this.
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
@@ -921,7 +981,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'url': url,
'upload_date': upload_date,
@@ -1229,6 +1289,11 @@ class GenericIE(InfoExtractor):
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
+ # Look for embedded SportBox player
+ sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+ if sportbox_urls:
+ return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1388,7 +1453,7 @@ class GenericIE(InfoExtractor):
# Look for Senate ISVP iframe
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
- return self.url_result(surl, 'SenateISVP')
+ return self.url_result(senate_isvp_url, 'SenateISVP')
def check_video(vurl):
if YoutubeIE.suitable(vurl):
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
index fe5d95e2c..d692ea79a 100644
--- a/youtube_dl/extractor/imgur.py
+++ b/youtube_dl/extractor/imgur.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
int_or_none,
js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
class ImgurIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ compat_urlparse.urljoin(url, video_id), video_id)
width = int_or_none(self._search_regex(
r'<param name="width" value="([0-9]+)"',
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 65f6ca103..b10755788 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -7,9 +7,9 @@ from ..utils import int_or_none
class InstagramIE(InfoExtractor):
- _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
+ _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
_TEST = {
- 'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
+ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
@@ -41,11 +41,11 @@ class InstagramIE(InfoExtractor):
class InstagramUserIE(InfoExtractor):
- _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+ _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
_TEST = {
- 'url': 'http://instagram.com/porsche',
+ 'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 8529bedfc..821c8ec10 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -11,11 +11,12 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ remove_end,
)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
'id': '39152',
'ext': 'flv',
'title': 'Partička (92)',
- 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+ 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
'id': '9718337',
'ext': 'flv',
'title': 'Tchibo Partička - Jarní móda',
- 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
'thumbnail': 're:^http:.*\.jpg$',
},
'params': {
'skip_download': True, # requires rtmpdump
},
- 'skip': 'Do not have permission to access this page',
+ }, {
+ 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
return {
'id': real_id,
- 'title': self._og_search_title(webpage),
+ 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
- 'description': self._og_search_description(webpage),
+ 'description': self._search_regex(
+ r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+ webpage, 'description', default=None),
}
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 99a1361f8..bc226fa67 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404302298,
+ 'timestamp': int,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163322193,
+ 'timestamp': int,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
uploader = self._html_search_regex(
r"adduserUsername\s*=\s*'([^']+)';",
- webpage, 'uploader', fatal=False, default='')
+ webpage, 'uploader', fatal=False)
timestamp = parse_iso8601(self._html_search_meta(
- 'uploadDate', webpage, 'upload date', fatal=False))
+ 'uploadDate', webpage, 'upload date'))
duration = float_or_none(self._html_search_regex(
r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
# Might be empty for some videos.
streams = self._html_search_regex(
- r'"qualitylevel"\s*:\s*"([^"]+)"',
- webpage, 'streams', fatal=False, default='')
+ r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
formats = []
if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
formats.append({
'format_id': '%sp' % quality if quality else 'sd',
- 'url': url,
+ 'url': compat_urllib_parse_unquote(url),
'ext': ext,
})
else:
stream_url = self._search_regex(
- r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+ r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
formats.append({
'format_id': 'sd',
- 'url': stream_url,
+ 'url': compat_urllib_parse_unquote(stream_url),
'ext': ext,
})
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644
index 000000000..bed94bc93
--- /dev/null
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ fix_xml_ampersands,
+ float_or_none,
+ xpath_with_ns,
+ xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+ 'info_dict': {
+ 'id': '32c91',
+ 'ext': 'flv',
+ 'title': 'AltenpflegerIn',
+ 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+ 'thumbnail': 're:^http://.*\.png',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # broken ampersands
+ 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+ 'info_dict': {
+ 'id': '5sniu',
+ 'ext': 'flv',
+ 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+ 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+ 'thumbnail': 're:^http://.*\.png',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = (self._html_search_meta('title', webpage, default=None) or
+ self._search_regex(r'<h1 class="title">([^<]+)</h1>'))
+
+ video_id = self._search_regex(
+ r'/config/video/(.+?)\.xml', webpage, 'video id')
+ playlist = self._download_xml(
+ 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+ video_id, transform_source=fix_xml_ampersands)
+
+ NS_MAP = {
+ 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+ }
+
+ def ns(path):
+ return xpath_with_ns(path, NS_MAP)
+
+ item = playlist.find('./tracklist/item')
+ video_file = xpath_text(
+ item, ns('./jwplayer:file'), 'video url', fatal=True)
+ streamer = xpath_text(
+ item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+ uploader = xpath_text(
+ item, ns('./jwplayer:author'), 'uploader')
+ duration = float_or_none(
+ xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+ description = self._html_search_regex(
+ r'(?s)<div class="leadtext">(.+?)</div>',
+ webpage, 'description')
+
+ thumbnail = self._html_search_meta(
+ 'thumbnail', webpage, 'thumbnail')
+ if thumbnail:
+ thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+ return {
+ 'id': video_id,
+ 'url': streamer.replace('rtmpt', 'rtmp'),
+ 'play_path': 'mp4:%s' % video_file,
+ 'ext': 'flv',
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py
index 1484ac0d2..da896caf1 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/letv.py
@@ -50,9 +50,7 @@ class LetvIE(InfoExtractor):
'title': '与龙共舞 完整版',
'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
},
- 'params': {
- 'cn_verification_proxy': 'http://proxy.uku.im:8888'
- },
+ 'skip': 'Only available in China',
}]
@staticmethod
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index d8897eb90..7091f3335 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
- 'md5': '6a75fe9d0d3275bead0cb683c616fddb',
'info_dict': {
'id': '0fce117d',
'ext': 'mp4',
@@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor):
'display_id': 'programa-144',
'duration': 2913,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor):
episode,
transform_source=strip_jsonp
)
+ formats = self._extract_m3u8_formats(
+ token_info['tokenizedUrl'], episode, ext='mp4')
return {
'id': embed_data['videoId'],
'display_id': episode,
'title': info_el.find('title').text,
- 'url': token_info['tokenizedUrl'],
+ 'formats': formats,
'description': get_element_by_attribute('class', 'text', webpage),
'thumbnail': info_el.find('thumb').text,
'duration': parse_duration(info_el.find('duration').text),
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index c10405f04..925967753 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
class NaverIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://tvcast.naver.com/v/81652',
'info_dict': {
'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
'upload_date': '20130903',
},
- }
+ }, {
+ 'url': 'http://tvcast.naver.com/v/395837',
+ 'md5': '638ed4c12012c458fefcddfd01f173cd',
+ 'info_dict': {
+ 'id': '395837',
+ 'ext': 'mp4',
+ 'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+ 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+ 'upload_date': '20150519',
+ },
+ 'skip': 'Georestricted',
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
webpage)
if m_id is None:
m_error = re.search(
- r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
webpage)
if m_error:
raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
domain = format_el.find('Domain').text
+ uri = format_el.find('uri').text
f = {
- 'url': domain + format_el.find('uri').text,
+ 'url': compat_urlparse.urljoin(domain, uri),
'ext': 'mp4',
'width': int(format_el.find('width').text),
'height': int(format_el.find('height').text),
}
if domain.startswith('rtmp'):
+ # urlparse does not support custom schemes
+ # https://bugs.python.org/issue18828
f.update({
+ 'url': domain + uri,
'ext': 'flv',
'rtmp_protocol': '1', # rtmpt
})
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 862b706bf..944096e1c 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
}, {
'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
'only_matching': True,
+ }, {
+ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+ 'info_dict': {
+ 'id': '0041400301-cle-atl-recap.nba',
+ 'ext': 'mp4',
+ 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+ 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+ 'duration': 228,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
}]
def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
description = self._og_search_description(webpage)
- duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration'))
+ duration_str = self._html_search_meta(
+ 'duration', webpage, 'duration', default=None)
+ if not duration_str:
+ duration_str = self._html_search_regex(
+ r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+ duration = parse_duration(duration_str)
return {
'id': shortened_video_id,
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index 02dba4ef6..d1b7cff4c 100644
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py
@@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):
return self._extract_from_nextmedia_page(news_id, url, article_page)
-class AppleDailyRealtimeNewsIE(NextMediaIE):
- _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+ _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
_TESTS = [{
'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:b23787119933404ce515c6356a8c355c',
+ 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
'upload_date': '20150128',
}
}, {
@@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '不滿被踩腳 山東兩大媽一路打下車',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+ 'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
'upload_date': '20150128',
}
- }]
-
- _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
- def _fetch_title(self, page):
- return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
- def _fetch_thumbnail(self, page):
- return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
- def _fetch_timestamp(self, page):
- return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
- _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
- _TESTS = [{
+ }, {
'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
'md5': '03df296d95dedc2d5886debbb80cb43f',
'info_dict': {
@@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
'expected_warnings': [
'video thumbnail',
]
+ }, {
+ 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+ 'only_matching': True,
}]
+ _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
def _fetch_title(self, page):
- return self._html_search_meta('description', page, 'news title')
+ return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+ self._html_search_meta('description', page, 'news title'))
+
+ def _fetch_thumbnail(self, page):
+ return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+ def _fetch_timestamp(self, page):
+ return None
def _fetch_description(self, page):
return self._html_search_meta('description', page, 'news description')
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644
index 000000000..3f9c776ef
--- /dev/null
+++ b/youtube_dl/extractor/nova.py
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+ IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+ _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+ _TESTS = [{
+ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+ 'info_dict': {
+ 'id': '1608920',
+ 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+ 'ext': 'flv',
+ 'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+ 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+ 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+ 'info_dict': {
+ 'id': '1757139',
+ 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+ 'ext': 'mp4',
+ 'title': 'Podzemní nemocnice v pražské Krči',
+ 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'info_dict': {
+ 'id': '1756825',
+ 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'ext': 'flv',
+ 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+ 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+ 'info_dict': {
+ 'id': '1756858',
+ 'ext': 'flv',
+ 'title': 'Televizní noviny - 30. 5. 2015',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ 'upload_date': '20150530',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'info_dict': {
+ 'id': '1753621',
+ 'ext': 'mp4',
+ 'title': 'Zaklínač 3: Divoký hon',
+ 'description': 're:.*Pokud se stejně jako my nemůžete.*',
+ 'thumbnail': 're:https?://.*\.jpg(\?.*)?',
+ 'upload_date': '20150521',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ site = mobj.group('site')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r"(?:media|video_id)\s*:\s*'(\d+)'",
+ r'media=(\d+)',
+ r'id="article_video_(\d+)"',
+ r'id="player_(\d+)"'],
+ webpage, 'video id')
+
+ config_url = self._search_regex(
+ r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+ webpage, 'config url', default=None)
+
+ if not config_url:
+ DEFAULT_SITE_ID = '23000'
+ SITES = {
+ 'tvnoviny': DEFAULT_SITE_ID,
+ 'novaplus': DEFAULT_SITE_ID,
+ 'vymena': DEFAULT_SITE_ID,
+ 'krasna': DEFAULT_SITE_ID,
+ 'fanda': '30',
+ 'tn': '30',
+ 'doma': '30',
+ }
+
+ site_id = self._search_regex(
+ r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+ config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+ % (site_id, video_id))
+
+ config = self._download_json(
+ config_url, display_id,
+ 'Downloading config JSON',
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+ mediafile = config['mediafile']
+ video_url = mediafile['src']
+
+ m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+ if m:
+ formats = [{
+ 'url': m.group('url'),
+ 'app': m.group('app'),
+ 'play_path': m.group('playpath'),
+ 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+ 'ext': 'flv',
+ }]
+ else:
+ formats = [{
+ 'url': video_url,
+ }]
+ self._sort_formats(formats)
+
+ title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+ description = clean_html(self._og_search_description(webpage, default=None))
+ thumbnail = config.get('poster')
+
+ if site == 'novaplus':
+ upload_date = unified_strdate(self._search_regex(
+ r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+ elif site == 'fanda':
+ upload_date = unified_strdate(self._search_regex(
+ r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+ else:
+ upload_date = None
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644
index 000000000..173e46cd8
--- /dev/null
+++ b/youtube_dl/extractor/nowtv.py
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ parse_duration,
+ remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+ _TESTS = [{
+ # rtl
+ 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+ 'info_dict': {
+ 'id': '203519',
+ 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+ 'ext': 'mp4',
+ 'title': 'Die neuen Bauern und eine Hochzeit',
+ 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432580700,
+ 'upload_date': '20150525',
+ 'duration': 2786,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # rtl2
+ 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+ 'info_dict': {
+ 'id': '203481',
+ 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+ 'ext': 'mp4',
+ 'title': 'Berlin - Tag & Nacht (Folge 934)',
+ 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432666800,
+ 'upload_date': '20150526',
+ 'duration': 2641,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # rtlnitro
+ 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+ 'info_dict': {
+ 'id': '165780',
+ 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+ 'ext': 'mp4',
+ 'title': 'Hals- und Beinbruch',
+ 'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432415400,
+ 'upload_date': '20150523',
+ 'duration': 2742,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # superrtl
+ 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+ 'info_dict': {
+ 'id': '99205',
+ 'display_id': 'medicopter-117/angst',
+ 'ext': 'mp4',
+ 'title': 'Angst!',
+ 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1222632900,
+ 'upload_date': '20080928',
+ 'duration': 3025,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # ntv
+ 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+ 'info_dict': {
+ 'id': '203521',
+ 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+ 'ext': 'mp4',
+ 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+ 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432751700,
+ 'upload_date': '20150527',
+ 'duration': 1083,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # vox
+ 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+ 'info_dict': {
+ 'id': '128953',
+ 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+ 'ext': 'mp4',
+ 'title': "Büro-Fall / Chihuahua 'Joel'",
+ 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432408200,
+ 'upload_date': '20150523',
+ 'duration': 3092,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ station = mobj.group('station')
+
+ info = self._download_json(
+ 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+ display_id)
+
+ video_id = compat_str(info['id'])
+
+ files = info['files']
+ if not files:
+ if info.get('geoblocked', False):
+ raise ExtractorError(
+ 'Video %s is not available from your location due to geo restriction' % video_id,
+ expected=True)
+ if not info.get('free', True):
+ raise ExtractorError(
+ 'Video %s is not available for free' % video_id, expected=True)
+
+ f = info.get('format', {})
+ station = f.get('station') or station
+
+ STATIONS = {
+ 'rtl': 'rtlnow',
+ 'rtl2': 'rtl2now',
+ 'vox': 'voxnow',
+ 'nitro': 'rtlnitronow',
+ 'ntv': 'n-tvnow',
+ 'superrtl': 'superrtlnow'
+ }
+
+ formats = []
+ for item in files['items']:
+ item_path = remove_start(item['path'], '/')
+ tbr = int_or_none(item['bitrate'])
+ m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+ m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+ formats.append({
+ 'url': m3u8_url,
+ 'format_id': '%s-%sk' % (item['id'], tbr),
+ 'ext': 'mp4',
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ title = info['title']
+ description = info.get('articleLong') or info.get('articleShort')
+ timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+ duration = parse_duration(info.get('duration'))
+ thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index fbc521d1a..6c7149fe3 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_urllib_parse
from ..utils import (
unified_strdate,
int_or_none,
@@ -11,8 +12,9 @@ from ..utils import (
class OdnoklassnikiIE(InfoExtractor):
- _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
_TESTS = [{
+ # metadata in JSON
'url': 'http://ok.ru/video/20079905452',
'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
'info_dict': {
@@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):
'ext': 'mp4',
'title': 'Культура меняет нас (прекрасный ролик!))',
'duration': 100,
- 'upload_date': '20141207',
'uploader_id': '330537914540',
'uploader': 'Виталий Добровольский',
'like_count': int,
- 'age_limit': 0,
+ },
+ }, {
+ # metadataUrl
+ 'url': 'http://ok.ru/video/63567059965189-0',
+ 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'info_dict': {
+ 'id': '63567059965189-0',
+ 'ext': 'mp4',
+ 'title': 'Девушка без комплексов ...',
+ 'duration': 191,
+ 'uploader_id': '534380003155',
+ 'uploader': 'Андрей Мещанинов',
+ 'like_count': int,
},
}, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ 'http://ok.ru/video/%s' % video_id, video_id)
player = self._parse_json(
unescapeHTML(self._search_regex(
r'data-attributes="([^"]+)"', webpage, 'player')),
video_id)
- metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+ flashvars = player['flashvars']
+
+ metadata = flashvars.get('metadata')
+ if metadata:
+ metadata = self._parse_json(metadata, video_id)
+ else:
+ metadata = self._download_json(
+ compat_urllib_parse.unquote(flashvars['metadataUrl']),
+ video_id, 'Downloading metadata JSON')
movie = metadata['movie']
title = movie['title']
@@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):
uploader = author.get('name')
upload_date = unified_strdate(self._html_search_meta(
- 'ya:ovs:upload_date', webpage, 'upload date'))
+ 'ya:ovs:upload_date', webpage, 'upload date', default=None))
age_limit = None
adult = self._html_search_meta(
- 'ya:ovs:adult', webpage, 'age limit')
+ 'ya:ovs:adult', webpage, 'age limit', default=None)
if adult:
age_limit = 18 if adult == 'true' else 0
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index c0e6d643d..a262a9f6d 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -12,50 +12,7 @@ from ..utils import (
)
-class OoyalaIE(InfoExtractor):
- _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
-
- _TESTS = [
- {
- # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
- 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'info_dict': {
- 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'ext': 'mp4',
- 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
- 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
- },
- }, {
- # Only available for ipad
- 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'info_dict': {
- 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'ext': 'mp4',
- 'title': 'Simulation Overview - Levels of Simulation',
- 'description': '',
- },
- },
- {
- # Information available only through SAS api
- # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
- 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
- 'md5': 'a84001441b35ea492bc03736e59e7935',
- 'info_dict': {
- 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
- 'ext': 'mp4',
- 'title': 'Ooyala video',
- }
- }
- ]
-
- @staticmethod
- def _url_for_embed_code(embed_code):
- return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
-
- @classmethod
- def _build_url_result(cls, embed_code):
- return cls.url_result(cls._url_for_embed_code(embed_code),
- ie=cls.ie_key())
+class OoyalaBaseIE(InfoExtractor):
def _extract_result(self, info, more_info):
embedCode = info['embedCode']
@@ -77,11 +34,8 @@ class OoyalaIE(InfoExtractor):
'thumbnail': more_info['promo'],
}
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- embedCode = mobj.group('id')
- player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
- player = self._download_webpage(player_url, embedCode)
+ def _extract(self, player_url, video_id):
+ player = self._download_webpage(player_url, video_id)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
player, 'mobile player url')
# Looks like some videos are only available for particular devices
@@ -94,7 +48,7 @@ class OoyalaIE(InfoExtractor):
devices.insert(0, 'unknown')
for device in devices:
mobile_player = self._download_webpage(
- '%s&device=%s' % (mobile_url, device), embedCode,
+ '%s&device=%s' % (mobile_url, device), video_id,
'Downloading mobile player JS for %s device' % device)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
@@ -105,10 +59,10 @@ class OoyalaIE(InfoExtractor):
if not videos_info:
formats = []
auth_data = self._download_json(
- 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode),
- embedCode)
+ 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id),
+ video_id)
- cur_auth_data = auth_data['authorization_data'][embedCode]
+ cur_auth_data = auth_data['authorization_data'][video_id]
for stream in cur_auth_data['streams']:
formats.append({
@@ -123,7 +77,7 @@ class OoyalaIE(InfoExtractor):
})
if formats:
return {
- 'id': embedCode,
+ 'id': video_id,
'formats': formats,
'title': 'Ooyala video',
}
@@ -143,9 +97,100 @@ class OoyalaIE(InfoExtractor):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
return {
'_type': 'playlist',
- 'id': embedCode,
+ 'id': video_id,
'title': unescapeHTML(videos_more_info['title']),
'entries': videos,
}
else:
return self._extract_result(videos_info[0], videos_more_info)
+
+
+class OoyalaIE(OoyalaBaseIE):
+ _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+ _TESTS = [
+ {
+ # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+ 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'info_dict': {
+ 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'ext': 'mp4',
+ 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+ 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+ },
+ }, {
+ # Only available for ipad
+ 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'info_dict': {
+ 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'ext': 'mp4',
+ 'title': 'Simulation Overview - Levels of Simulation',
+ 'description': '',
+ },
+ },
+ {
+ # Information available only through SAS api
+ # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+ 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'md5': 'a84001441b35ea492bc03736e59e7935',
+ 'info_dict': {
+ 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'ext': 'mp4',
+ 'title': 'Ooyala video',
+ }
+ }
+ ]
+
+ @staticmethod
+ def _url_for_embed_code(embed_code):
+ return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+ @classmethod
+ def _build_url_result(cls, embed_code):
+ return cls.url_result(cls._url_for_embed_code(embed_code),
+ ie=cls.ie_key())
+
+ def _real_extract(self, url):
+ embed_code = self._match_id(url)
+ player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ return self._extract(player_url, embed_code)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ ooyalaexternal:|
+ https?://.+?\.ooyala\.com/.*?\bexternalId=
+ )
+ (?P<partner_id>[^:]+)
+ :
+ (?P<id>.+)
+ (?:
+ :|
+ .*?&pcode=
+ )
+ (?P<pcode>.+?)
+ (&|$)
+ '''
+
+ _TEST = {
+ 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ partner_id = mobj.group('partner_id')
+ video_id = mobj.group('id')
+ pcode = mobj.group('pcode')
+ player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode)
+ return self._extract(player_url, video_id)
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index f179ea200..6cdc2638b 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
embed = self._html_search_regex(
- r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+ r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
webpage, 'embedded URL', default=None)
if attach_fn is not None:
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644
index 000000000..72d1b2718
--- /dev/null
+++ b/youtube_dl/extractor/porn91.py
@@ -0,0 +1,71 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+ ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+ IE_NAME = '91porn'
+ _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+ _TEST = {
+ 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+ 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'info_dict': {
+ 'id': '7e42283b4f5ab36da134',
+ 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+ 'ext': 'mp4',
+ 'duration': 431,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+ self._set_cookie('91porn.com', 'language', 'cn_CN')
+ webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+ if '作为游客,你每天只可观看10个视频' in webpage:
+ raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+ title = self._search_regex(
+ r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+ title = title.replace('\n', '')
+
+ # get real url
+ file_id = self._search_regex(
+ r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+ sec_code = self._search_regex(
+ r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+ max_vid = self._search_regex(
+ r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+ url_params = compat_urllib_parse.urlencode({
+ 'VID': file_id,
+ 'mp4': '1',
+ 'seccode': sec_code,
+ 'max_vid': max_vid,
+ })
+ info_cn = self._download_webpage(
+ 'http://91porn.com/getfile.php?' + url_params, video_id,
+ 'get real video url')
+ video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+ duration = parse_duration(self._search_regex(
+ r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+ comment_count = int_or_none(self._search_regex(
+ r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'comment_count': comment_count,
+ }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 0c8b731cf..daa284ea2 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -71,7 +71,8 @@ class PornHubIE(InfoExtractor):
video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
- password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+ password = compat_urllib_parse.unquote_plus(
+ self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
formats = []
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
index 9688ed948..eba4dfbb3 100644
--- a/youtube_dl/extractor/pornovoisines.py
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
'duration': 120,
'view_count': int,
'average_rating': float,
- 'categories': ['Débutante', 'Scénario', 'Sodomie'],
+ 'categories': ['Débutantes', 'Scénario', 'Sodomie'],
'age_limit': 18,
}
}
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
view_count = int_or_none(self._search_regex(
r'(\d+) vues', webpage, 'view count', fatal=False))
average_rating = self._search_regex(
- r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+ r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
if average_rating:
average_rating = float_or_none(average_rating.replace(',', '.'))
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 7cc799664..255d4abc1 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -17,7 +17,7 @@ from ..utils import (
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 13113820b..bafa81c21 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -9,7 +9,6 @@ from .common import InfoExtractor
from ..utils import (
strip_jsonp,
unescapeHTML,
- js_to_json,
)
from ..compat import compat_urllib_request
@@ -19,17 +18,23 @@ class QQMusicIE(InfoExtractor):
_VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
- 'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+ 'md5': '9ce1c1c8445f561506d2e3cfb0255705',
'info_dict': {
'id': '004295Et37taLD',
- 'ext': 'm4a',
+ 'ext': 'mp3',
'title': '可惜没如果',
'upload_date': '20141227',
'creator': '林俊杰',
- 'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
+ 'description': 'md5:d327722d0361576fde558f1ac68a7065',
}
}]
+ _FORMATS = {
+ 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+ 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+ 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+ }
+
# Reference: m_r_GetRUin() in top_player.js
# http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
@staticmethod
@@ -60,6 +65,8 @@ class QQMusicIE(InfoExtractor):
lrc_content = self._html_search_regex(
r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
detail_info_page, 'LRC lyrics', default=None)
+ if lrc_content:
+ lrc_content = lrc_content.replace('\\n', '\n')
guid = self.m_r_get_ruin()
@@ -67,11 +74,22 @@ class QQMusicIE(InfoExtractor):
'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
mid, note='Retrieve vkey', errnote='Unable to get vkey',
transform_source=strip_jsonp)['key']
- song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+ formats = []
+ for format_id, details in self._FORMATS.items():
+ formats.append({
+ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+ % (details['prefix'], mid, details['ext'], vkey, guid),
+ 'format': format_id,
+ 'format_id': format_id,
+ 'preference': details['preference'],
+ 'abr': details.get('abr'),
+ })
+ self._sort_formats(formats)
return {
'id': mid,
- 'url': song_url,
+ 'formats': formats,
'title': song_name,
'upload_date': publish_time,
'creator': singer,
@@ -179,60 +197,49 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
_VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=toplist&p=global_12',
+ 'url': 'http://y.qq.com/#type=toplist&p=global_123',
'info_dict': {
- 'id': 'global_12',
- 'title': 'itunes榜',
+ 'id': 'global_123',
+ 'title': '美国iTunes榜',
},
'playlist_count': 10,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=top_6',
+ 'url': 'http://y.qq.com/#type=toplist&p=top_3',
'info_dict': {
- 'id': 'top_6',
+ 'id': 'top_3',
'title': 'QQ音乐巅峰榜·欧美',
+ 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+ '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+ '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+ '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
},
'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=global_5',
+ 'url': 'http://y.qq.com/#type=toplist&p=global_106',
'info_dict': {
- 'id': 'global_5',
- 'title': '韩国mnet排行榜',
+ 'id': 'global_106',
+ 'title': '韩国Mnet榜',
},
'playlist_count': 50,
}]
- @staticmethod
- def strip_qq_jsonp(code):
- return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code))
-
def _real_extract(self, url):
list_id = self._match_id(url)
list_type, num_id = list_id.split("_")
- list_page = self._download_webpage(
- "http://y.qq.com/y/static/toplist/index/%s.html" % list_id,
- list_id, 'Download toplist page')
-
- entries = []
- if list_type == 'top':
- jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id
- else:
- jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id
-
toplist_json = self._download_json(
- jsonp_url, list_id, note='Retrieve toplist json',
- errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp)
-
- for song in toplist_json['l']:
- s = song['s']
- song_mid = s.split("|")[20]
- entries.append(self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
- song_mid))
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+ % (list_type, num_id),
+ list_id, 'Download toplist page')
- list_name = self._html_search_regex(
- r'<h2 id="top_name">([^\']+)</h2>', list_page, 'top list name',
- default=None)
+ entries = [
+ self.url_result(
+ 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
+ ) for song in toplist_json['songlist']
+ ]
- return self.playlist_result(entries, list_id, list_name)
+ topinfo = toplist_json.get('topinfo', {})
+ list_name = topinfo.get('ListName')
+ list_description = topinfo.get('info')
+ return self.playlist_result(entries, list_id, list_name, list_description)
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index dce64e151..5a381d9ce 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -1,10 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unescapeHTML,
+)
class RTBFIE(InfoExtractor):
@@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor):
'id': '1921274',
'ext': 'mp4',
'title': 'Les Diables au coeur (épisode 2)',
- 'description': 'Football - Diables Rouges',
'duration': 3099,
- 'timestamp': 1398456336,
- 'upload_date': '20140425',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+ webpage = self._download_webpage(
+ 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
- data = json.loads(self._html_search_regex(
- r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+ data = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-video="([^"]+)"', webpage, 'data video')),
+ video_id)
video_url = data.get('downloadUrl') or data.get('url')
- if data['provider'].lower() == 'youtube':
+ if data.get('provider').lower() == 'youtube':
return self.url_result(video_url, 'Youtube')
return {
@@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor):
'url': video_url,
'title': data['title'],
'description': data.get('description') or data.get('subtitle'),
- 'thumbnail': data['thumbnail']['large'],
+ 'thumbnail': data.get('thumbnail'),
'duration': data.get('duration') or data.get('realDuration'),
- 'timestamp': data['created'],
- 'view_count': data['viewCount'],
+ 'timestamp': int_or_none(data.get('created')),
+ 'view_count': int_or_none(data.get('viewCount')),
}
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644
index 785a8045e..000000000
--- a/youtube_dl/extractor/rtlnow.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- clean_html,
- unified_strdate,
- int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
- """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'''(?x)
- (?:https?://)?
- (?P<url>
- (?P<domain>
- rtl-now\.rtl\.de|
- rtl2now\.rtl2\.de|
- (?:www\.)?voxnow\.de|
- (?:www\.)?rtlnitronow\.de|
- (?:www\.)?superrtlnow\.de|
- (?:www\.)?n-tvnow\.de)
- /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
- (?:container_id|film_id)=(?P<video_id>[0-9]+)&
- player=1(?:&season=[0-9]+)?(?:&.*)?
- )'''
-
- _TESTS = [
- {
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'info_dict': {
- 'id': '90419',
- 'ext': 'flv',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
- 'upload_date': '20070416',
- 'duration': 1685,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'info_dict': {
- 'id': '69756',
- 'ext': 'flv',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- 'upload_date': '20120519',
- 'duration': 1245,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'info_dict': {
- 'id': '13883',
- 'ext': 'flv',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
- 'upload_date': '20090627',
- 'duration': 1800,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'info_dict': {
- 'id': '99205',
- 'ext': 'flv',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
- 'upload_date': '20080928',
- 'duration': 2691,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
- 'info_dict': {
- 'id': '188729',
- 'ext': 'flv',
- 'upload_date': '20150204',
- 'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
- 'title': 'Der Bachelor - Folge 4',
- }
- }, {
- 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
- 'only_matching': True,
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_page_url = 'http://%s/' % mobj.group('domain')
- video_id = mobj.group('video_id')
-
- webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
- mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
- if mobj:
- raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage, default=None)
-
- upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
- mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
- duration = int(mobj.group('seconds')) if mobj else None
-
- playerdata_url = self._html_search_regex(
- r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
- playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
- videoinfo = playerdata.find('./playlist/videoinfo')
-
- formats = []
- for filename in videoinfo.findall('filename'):
- mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
- if mobj:
- fmt = {
- 'url': mobj.group('url'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': video_page_url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
- if mobj:
- fmt = {
- 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- fmt = {
- 'url': filename.text,
- }
- fmt.update({
- 'width': int_or_none(filename.get('width')),
- 'height': int_or_none(filename.get('height')),
- 'vbr': int_or_none(filename.get('bitrate')),
- 'ext': 'flv',
- })
- formats.append(fmt)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index d0981115d..9fbe239d8 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -190,6 +190,7 @@ class RTSIE(InfoExtractor):
'tbr': media['rate'] or extract_bitrate(media['url']),
} for media in info['media'] if media.get('rate')])
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 849300140..82cd98ac7 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -17,7 +17,7 @@ from ..utils import (
def _decrypt_url(png):
- encrypted_data = base64.b64decode(png)
+ encrypted_data = base64.b64decode(png.encode('utf-8'))
text_index = encrypted_data.find(b'tEXt')
text_chunk = encrypted_data[text_index - 4:]
length = struct_unpack('!I', text_chunk[:4])[0]
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index 55604637d..d9df06861 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index b8775c2f9..d4bd1a0d7 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
from ..utils import (
@@ -33,16 +32,18 @@ class SBSIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
- release_urls_json = js_to_json(self._search_regex(
+ player = self._search_regex(
r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
- webpage, ''))
- release_urls = json.loads(release_urls_json)
- theplatform_url = (
- release_urls.get('progressive') or release_urls.get('standard'))
+ webpage, 'player')
+ player = re.sub(r"'\s*\+\s*[\da-zA-Z_]+\s*\+\s*'", '', player)
+
+ release_urls = self._parse_json(js_to_json(player), video_id)
+
+ theplatform_url = release_urls.get('progressive') or release_urls['standard']
title = remove_end(self._og_search_title(webpage), ' (The Feed)')
description = self._html_search_meta('description', webpage)
@@ -52,7 +53,6 @@ class SBSIE(InfoExtractor):
'_type': 'url_transparent',
'id': video_id,
'url': theplatform_url,
-
'title': title,
'description': description,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index d3b8a1be4..9c53704ea 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
["arch", "", "http://ussenate-f.akamaihd.net/"]
]
_IE_NAME = 'senate.gov'
- _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+ _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
'ext': 'mp4',
'title': 'Integrated Senate Video Player'
}
+ }, {
+ # From http://www.c-span.org/video/?96791-1
+ 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+ 'only_matching': True,
}]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
- r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+ r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index 26ced716e..9f3e944e7 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -47,7 +47,7 @@ class SharedIE(InfoExtractor):
video_url = self._html_search_regex(
r'data-url="([^"]+)"', video_page, 'video URL')
title = base64.b64decode(self._html_search_meta(
- 'full:title', webpage, 'title')).decode('utf-8')
+ 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
filesize = int_or_none(self._html_search_meta(
'full:size', webpage, 'file size', fatal=False))
thumbnail = self._html_search_regex(
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644
index b5fa6f1da..000000000
--- a/youtube_dl/extractor/sockshare.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- determine_ext,
- ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
- _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
- _TEST = {
- 'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
- 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
- 'info_dict': {
- 'id': '437BE28B89D799D7',
- 'title': 'big_buck_bunny_720p_surround.avi',
- 'ext': 'avi',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://sockshare.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- confirm_hash = self._html_search_regex(r'''(?x)<input\s+
- type="hidden"\s+
- value="([^"]*)"\s+
- name="hash"
- ''', webpage, 'hash')
-
- fields = {
- "hash": confirm_hash.encode('utf-8'),
- "confirm": "Continue as Free User"
- }
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.sockshare.com')
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- webpage = self._download_webpage(
- req, video_id, 'Downloading video page')
-
- video_url = self._html_search_regex(
- r'<a href="([^"]*)".+class="download_file_link"',
- webpage, 'file url')
- video_url = "http://www.sockshare.com" + video_url
- title = self._html_search_regex((
- r'<h1>(.+)<strong>',
- r'var name = "([^"]+)";'),
- webpage, 'title', default=None)
- thumbnail = self._html_search_regex(
- r'<img\s+src="([^"]*)".+?name="bg"',
- webpage, 'thumbnail', default=None)
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': determine_ext(title),
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index eab4adfca..29bd9ce6f 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -23,9 +23,7 @@ class SohuIE(InfoExtractor):
'ext': 'mp4',
'title': 'MV:Far East Movement《The Illest》',
},
- 'params': {
- 'cn_verification_proxy': 'proxy.uku.im:8888'
- }
+ 'skip': 'On available in China',
}, {
'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644
index 000000000..5da66ca9e
--- /dev/null
+++ b/youtube_dl/extractor/soompi.py
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ remove_start,
+ xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+ def _get_episodes(self, webpage, episode_filter=None):
+ episodes = self._parse_json(
+ self._search_regex(
+ r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+ None)
+ return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+ IE_NAME = 'soompi'
+ _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/watch/29235',
+ 'info_dict': {
+ 'id': '29235',
+ 'ext': 'mp4',
+ 'title': 'Episode 1096',
+ 'description': '2015-05-20'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _get_episode(self, webpage, video_id):
+ return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+ def _get_subtitles(self, config, video_id):
+ sub_langs = {}
+ for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+ sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+ subtitles = {}
+ for s in config.findall('./{default}preload/subtitle'):
+ lang_code = sub_langs.get(s.attrib['id'])
+ if not lang_code:
+ continue
+ sub_id = s.get('id')
+ data = xpath_text(s, './data', 'data')
+ iv = xpath_text(s, './iv', 'iv')
+ if not id or not iv or not data:
+ continue
+ subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
+ return subtitles
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ try:
+ webpage = self._download_webpage(
+ url, video_id, 'Downloading episode page')
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+ webpage = ee.cause.read()
+ block_message = self._html_search_regex(
+ r'(?s)<div class="block-message">(.+?)</div>', webpage,
+ 'block message', default=None)
+ if block_message:
+ raise ExtractorError(block_message, expected=True)
+ raise
+
+ formats = []
+ config = None
+ for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+ config = self._download_xml(
+ 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+ video_id, 'Downloading %s XML' % format_id)
+ m3u8_url = xpath_text(
+ config, './{default}preload/stream_info/file',
+ '%s m3u8 URL' % format_id)
+ if not m3u8_url:
+ continue
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+ self._sort_formats(formats)
+
+ episode = self._get_episode(webpage, video_id)
+
+ title = episode['name']
+ description = episode.get('description')
+ duration = int_or_none(episode.get('duration'))
+
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+ subtitles = self.extract_subtitles(config, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles
+ }
+
+
+class SoompiShowIE(SoompiBaseIE):
+ IE_NAME = 'soompi:show'
+ _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/shows/liar-game',
+ 'info_dict': {
+ 'id': 'liar-game',
+ 'title': 'Liar Game',
+ 'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+ },
+ 'playlist_count': 14,
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ url, show_id, 'Downloading show page')
+
+ title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+ description = self._og_search_description(webpage)
+
+ entries = [
+ self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+ for episode in self._get_episodes(webpage)]
+
+ return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index b936202f6..06d6e6640 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor):
compat_urllib_parse.unquote,
re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
- password = self._html_search_regex(
+ password = self._search_regex(
r'flashvars\.video_title = "([^"]+)',
webpage, 'password').replace('+', ' ')
video_urls = list(map(
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89..359722ad6 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -51,9 +51,9 @@ class SpiegeltvIE(InfoExtractor):
is_wide = media_json['is_wide']
server_json = self._download_json(
- 'http://www.spiegel.tv/streaming_servers/', video_id,
- note='Downloading server information')
- server = server_json[0]['endpoint']
+ 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+ video_id, note='Downloading server information')
+ server = server_json['streamingserver'][0]['endpoint']
thumbnails = []
for image in media_json['images']:
@@ -76,5 +76,6 @@ class SpiegeltvIE(InfoExtractor):
'ext': 'm4v',
'description': description,
'duration': duration,
- 'thumbnails': thumbnails
+ 'thumbnails': thumbnails,
+ 'rtmp_live': True,
}
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index becdf658f..86d509ae5 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -4,37 +4,36 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
- parse_duration,
- parse_iso8601,
+ unified_strdate,
)
class SportBoxIE(InfoExtractor):
- _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
- _TESTS = [
- {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
- 'md5': 'ff56a598c2cf411a9a38a69709e97079',
- 'info_dict': {
- 'id': '80822',
- 'ext': 'mp4',
- 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
- 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'timestamp': 1411896237,
- 'upload_date': '20140928',
- 'duration': 4846,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+ 'md5': 'ff56a598c2cf411a9a38a69709e97079',
+ 'info_dict': {
+ 'id': '80822',
+ 'ext': 'mp4',
+ 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+ 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140928',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+ player = self._search_regex(
+ r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
+
+ title = self._html_search_regex(
+ [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+ webpage, 'title')
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'dateCreated', webpage, 'upload date'))
+
+ return {
+ '_type': 'url_transparent',
+ 'url': compat_urlparse.urljoin(url, '/%s' % player),
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
- player = self._download_webpage(
- 'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
- display_id, 'Downloading player webpage')
+
+class SportBoxEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+ 'info_dict': {
+ 'id': '211355',
+ 'ext': 'mp4',
+ 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
+ webpage)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
hls = self._search_regex(
- r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+ r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
+ webpage, 'hls file')
- formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+ formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
- title = self._html_search_regex(
- r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
- description = self._html_search_regex(
- r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
- timestamp = parse_iso8601(self._search_regex(
- r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
- duration = parse_duration(self._html_search_regex(
- r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+ title = self._search_regex(
+ r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+
+ thumbnail = self._search_regex(
+ r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
+ webpage, 'thumbnail', default=None)
return {
'id': video_id,
- 'display_id': display_id,
'title': title,
- 'description': description,
'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 56be52638..d1b7264b4 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -10,6 +10,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
qualities,
+ determine_ext,
)
from ..compat import compat_ord
@@ -50,6 +51,17 @@ class TeamcocoIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 downloads
}
+ }, {
+ 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+ 'info_dict': {
+ 'id': '89341',
+ 'ext': 'mp4',
+ 'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
}
]
_VIDEO_ID_REGEXES = (
@@ -108,10 +120,24 @@ class TeamcocoIE(InfoExtractor):
formats = []
get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
for filed in data['files']:
- if filed['type'] == 'hls':
- formats.extend(self._extract_m3u8_formats(
- filed['url'], video_id, ext='mp4'))
+ if determine_ext(filed['url']) == 'm3u8':
+ # compat_urllib_parse.urljoin does not work here
+ if filed['url'].startswith('/'):
+ m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+ else:
+ m3u8_url = filed['url']
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4')
+ for m3u8_format in m3u8_formats:
+ if m3u8_format not in formats:
+ formats.append(m3u8_format)
+ elif determine_ext(filed['url']) == 'f4m':
+ # TODO Correct f4m extraction
+ continue
else:
+ if filed['url'].startswith('/mp4:protected/'):
+ # TODO Correct extraction for these files
+ continue
m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
if m_format is not None:
format_id = m_format.group(1)
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 251a68680..a0c744fd1 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):
'title': 'Con Martín Berasategui, hacer un bacalao al ...',
'duration': 662,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
'only_matching': True,
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index 466155ef8..f6694149b 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -2,6 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ float_or_none,
+)
class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
if protocol == 'rtmp':
url = url.replace('&mp4:', '')
+ tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
formats.append({
- 'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
- 'width': rendition['frameWidth'],
- 'height': rendition['frameHeight'],
- 'tbr': rendition['encodingRate'] / 1024,
- 'filesize': rendition['size'],
+ 'format_id': '_'.join(
+ ['rtmp', rendition['videoContainer'].lower(),
+ rendition['videoCodec'].lower(), '%sk' % tbr]),
+ 'width': int_or_none(rendition['frameWidth']),
+ 'height': int_or_none(rendition['frameHeight']),
+ 'tbr': tbr,
+ 'filesize': int_or_none(rendition['size']),
'protocol': protocol,
'ext': ext,
'vcodec': rendition['videoCodec'].lower(),
'container': rendition['videoContainer'].lower(),
'url': url,
})
+ self._sort_formats(formats)
return {
'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
'url': json['thumbnailURL']
}],
'thumbnail': json['videoStillURL'],
- 'duration': json['length'] / 1000,
- 'timestamp': float(json['creationDate']) / 1000,
- 'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
- 'view_count': json['playsTotal']
+ 'duration': float_or_none(json.get('length'), 1000),
+ 'timestamp': float_or_none(json.get('creationDate'), 1000),
+ 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+ 'view_count': int_or_none(json.get('playsTotal')),
}
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 025d0877c..3a68eaa80 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
- _TESTS = {
+ _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index d48cbbf14..c282865b2 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -10,26 +10,32 @@ from ..utils import (
class TNAFlixIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
_TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
_DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
_CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
- _TEST = {
- 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
- 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
- 'info_dict': {
- 'id': '553878',
- 'display_id': 'Carmella-Decesare-striptease',
- 'ext': 'mp4',
- 'title': 'Carmella Decesare - striptease',
- 'description': '',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'duration': 91,
- 'age_limit': 18,
+ _TESTS = [
+ {
+ 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+ 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+ 'info_dict': {
+ 'id': '553878',
+ 'display_id': 'Carmella-Decesare-striptease',
+ 'ext': 'mp4',
+ 'title': 'Carmella Decesare - striptease',
+ 'description': '',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 91,
+ 'age_limit': 18,
+ }
+ },
+ {
+ 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+ 'only_matching': True,
}
- }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -45,9 +51,8 @@ class TNAFlixIE(InfoExtractor):
age_limit = self._rta_search(webpage)
- duration = self._html_search_meta('duration', webpage, 'duration', default=None)
- if duration:
- duration = parse_duration(duration[1:])
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
cfg_url = self._proto_relative_url(self._html_search_regex(
self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
@@ -56,14 +61,15 @@ class TNAFlixIE(InfoExtractor):
cfg_url, display_id, note='Downloading metadata',
transform_source=fix_xml_ampersands)
- thumbnail = cfg_xml.find('./startThumb').text
+ thumbnail = self._proto_relative_url(
+ cfg_xml.find('./startThumb').text, 'http:')
formats = []
for item in cfg_xml.findall('./quality/item'):
video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
format_id = item.find('res').text
fmt = {
- 'url': video_url,
+ 'url': self._proto_relative_url(video_url, 'http:'),
'format_id': format_id,
}
m = re.search(r'^(\d+)', format_id)
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762..6ca8840b0 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
webpage = self._download_webpage(req, display_id)
flashvars = json.loads(self._html_search_regex(
- r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+ r'flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
video_url = flashvars['video_url']
if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
thumbnail = flashvars.get('image_url')
title = self._html_search_regex(
- r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+ r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
description = self._html_search_regex(
- r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+ r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
uploader = self._html_search_regex(
- r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+ r'<span class="username">\s*(.+?)\s*<',
webpage, 'uploader', fatal=False)
like_count = int_or_none(self._html_search_regex(
- r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+ r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
dislike_count = int_or_none(self._html_search_regex(
- r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+ r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
view_count = self._html_search_regex(
- r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+ r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 000000000..2c4b21807
--- /dev/null
+++ b/youtube_dl/extractor/tubitv.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+ _LOGIN_URL = 'http://tubitv.com/login'
+ _NETRC_MACHINE = 'tubitv'
+ _TEST = {
+ 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+ 'info_dict': {
+ 'id': '54411',
+ 'ext': 'mp4',
+ 'title': 'The Kitchen Musical - EP01',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'description': 'md5:37532716166069b353e8866e71fefae7',
+ 'duration': 2407,
+ },
+ 'params': {
+ 'skip_download': 'HLS download',
+ },
+ }
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ form_data = {
+ 'username': username,
+ 'password': password,
+ }
+ payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+ request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ login_page = self._download_webpage(
+ request, None, False, 'Wrong login info')
+ if not re.search(r'id="tubi-logout"', login_page):
+ raise ExtractorError(
+ 'Login failed (invalid username/password)', expected=True)
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+ raise ExtractorError(
+ 'This video requires login, use --username and --password '
+ 'options to provide account credentials.', expected=True)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+ m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 828c808a6..e6218808f 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -28,6 +28,17 @@ class TumblrIE(InfoExtractor):
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
+ }, {
+ 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+ 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+ 'info_dict': {
+ 'id': 'Wmur',
+ 'ext': 'mp4',
+ 'title': 'naked smoking & stretching',
+ 'upload_date': '20150506',
+ 'timestamp': 1430931613,
+ },
+ 'add_ie': ['Vidme'],
}]
def _real_extract(self, url):
@@ -38,6 +49,12 @@ class TumblrIE(InfoExtractor):
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
+ vid_me_embed_url = self._search_regex(
+ r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+ webpage, 'vid.me embed', default=None)
+ if vid_me_embed_url is not None:
+ return self.url_result(vid_me_embed_url, 'Vidme')
+
iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
webpage, 'iframe url')
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index 4de0aac52..fad720b68 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
data_content = self._download_webpage(
'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
- video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+ video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
return {
'id': internal_id,
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644
index 000000000..fa338b936
--- /dev/null
+++ b/youtube_dl/extractor/tv2.py
@@ -0,0 +1,126 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
+ remove_end,
+)
+
+
+class TV2IE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tv2.no/v/916509/',
+ 'md5': '9cb9e3410b18b515d71892f27856e9b1',
+ 'info_dict': {
+ 'id': '916509',
+ 'ext': 'flv',
+ 'title': 'Se Gryttens hyllest av Steven Gerrard',
+ 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+ 'timestamp': 1431715610,
+ 'upload_date': '20150515',
+ 'duration': 156.967,
+ 'view_count': int,
+ 'categories': list,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats = []
+ format_urls = []
+ for protocol in ('HDS', 'HLS'):
+ data = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
+ video_id, 'Downloading play JSON')['playback']
+ for item in data['items']['item']:
+ video_url = item.get('url')
+ if not video_url or video_url in format_urls:
+ continue
+ format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+ if not self._is_valid_url(video_url, video_id, format_id):
+ continue
+ format_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=format_id))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=format_id))
+ elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+ pass
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'tbr': int_or_none(item.get('bitrate')),
+ 'filesize': int_or_none(item.get('fileSize')),
+ })
+ self._sort_formats(formats)
+
+ asset = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
+ video_id, 'Downloading metadata JSON')['asset']
+
+ title = asset['title']
+ description = asset.get('description')
+ timestamp = parse_iso8601(asset.get('createTime'))
+ duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
+ view_count = int_or_none(asset.get('views'))
+ categories = asset.get('keywords', '').split(',')
+
+ thumbnails = [{
+ 'id': thumbnail.get('@type'),
+ 'url': thumbnail.get('url'),
+ } for _, thumbnail in asset.get('imageVersions', {}).items()]
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'categories': categories,
+ 'formats': formats,
+ }
+
+
+class TV2ArticleIE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+ 'info_dict': {
+ 'id': '6930542',
+ 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
+ 'description': 'md5:339573779d3eea3542ffe12006190954',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'http://www.tv2.no/a/6930542',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
+ for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+
+ title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+ description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 102362b29..dc3a8334a 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -5,7 +5,9 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
float_or_none,
+ int_or_none,
parse_age_limit,
)
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
'display_id': 'sokrat',
'ext': 'flv',
'title': 'Сократ',
- 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
'duration': 6586,
- 'age_limit': 0,
+ 'age_limit': 12,
},
+ 'skip': 'georestricted',
},
{
'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
- 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+ 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '5142516',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'duration': 186.080,
'age_limit': 0,
},
+ 'skip': 'georestricted',
}, {
'url': 'https://cloud.tvigle.ru/video/5267604/',
'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
- r'<li class="video-preview current_playing" id="(\d+)">',
+ r'class="video-preview current_playing" id="(\d+)">',
webpage, 'video id')
video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
item = video_data['playlist']['items'][0]
+ videos = item.get('videos')
+
+ error_message = item.get('errorMessage')
+ if not videos and error_message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
title = item['title']
- description = item['description']
- thumbnail = item['thumbnail']
+ description = item.get('description')
+ thumbnail = item.get('thumbnail')
duration = float_or_none(item.get('durationMilliseconds'), 1000)
age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
- for quality, video_url in fmts.items():
+ for format_id, video_url in fmts.items():
+ if format_id == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=vcodec))
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
formats.append({
'url': video_url,
- 'format_id': '%s-%s' % (vcodec, quality),
+ 'format_id': '%s-%s' % (vcodec, format_id),
'vcodec': vcodec,
- 'height': int(quality[:-1]),
- 'filesize': item['video_files_size'][vcodec][quality],
+ 'height': int_or_none(height),
+ 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index 67e8bfea0..c1ee1decc 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.24video.net/video/view/1044982',
- 'md5': '48dd7646775690a80447a8dca6a2df76',
+ 'md5': 'd041af8b5b4246ea466226a0d6693345',
'info_dict': {
'id': '1044982',
'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
webpage, 'upload date'))
uploader = self._html_search_regex(
- r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+ r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
webpage, 'uploader', fatal=False)
view_count = int_or_none(self._html_search_regex(
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
index 96c809eaf..c4751050e 100644
--- a/youtube_dl/extractor/ultimedia.py
+++ b/youtube_dl/extractor/ultimedia.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
qualities,
@@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- deliver_url = self._search_regex(
- r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
- webpage, 'deliver URL')
+ deliver_url = self._proto_relative_url(self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+ webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
deliver_page = self._download_webpage(
deliver_url, video_id, 'Downloading iframe page')
@@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):
player = self._parse_json(
self._search_regex(
- r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+ r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
+ deliver_page, 'player'),
video_id)
quality = qualities(['flash', 'html5'])
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index e6ee1e471..f38a72fde 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class VGTVIE(InfoExtractor):
@@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):
},
{
# streamType: live
- 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+ 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
'info_dict': {
- 'id': '100015',
+ 'id': '113063',
'ext': 'flv',
- 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
- 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+ 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:b3743425765355855f88e096acc93231',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 0,
- 'timestamp': 1407423348,
- 'upload_date': '20140807',
+ 'timestamp': 1432975582,
+ 'upload_date': '20150530',
'view_count': int,
},
'params': {
@@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):
% (host, video_id, HOST_WEBSITES[host]),
video_id, 'Downloading media JSON')
+ if data.get('status') == 'inactive':
+ raise ExtractorError(
+ 'Video %s is no longer available' % video_id, expected=True)
+
streams = data['streamUrls']
+ stream_type = data.get('streamType')
formats = []
@@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):
hls_url, video_id, 'mp4', m3u8_id='hls'))
hds_url = streams.get('hds')
- if hds_url:
+ # wasLive hds are always 404
+ if hds_url and stream_type != 'wasLive':
formats.extend(self._extract_f4m_formats(
hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
video_id, f4m_id='hds'))
@@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': data['title'],
+ 'title': self._live_title(data['title']),
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
'formats': formats,
+ 'is_live': True if stream_type == 'live' else False,
}
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index ececc7ee0..591024ead 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
formats = [
{
- 'url': base64.b64decode(res['u']).decode('utf-8'),
+ 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
'ext': 'flv',
'format_id': res['l'],
} for res in settings['res'] if res['u']
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index bd953fb4c..e0b55078b 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -10,7 +10,7 @@ from ..utils import (
class VidmeIE(InfoExtractor):
_VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vid.me/QNB',
'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
'info_dict': {
@@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):
'upload_date': '20140725',
'thumbnail': 're:^https?://.*\.jpg',
},
- }
+ }, {
+ # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+ 'url': 'https://vid.me/e/Wmur',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
+ url = url.replace('vid.me/e/', 'vid.me/')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 619039e51..15377097e 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -38,11 +38,14 @@ class VierIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
- r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+ [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+ webpage, 'video id')
application = self._search_regex(
- r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+ [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+ webpage, 'application', default='vier_vod')
filename = self._search_regex(
- r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+ [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+ webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index cf6af1e5c..7f2fb1ca8 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,29 +1,65 @@
from __future__ import unicode_literals
-import re
+import time
+import hmac
+import hashlib
+import itertools
-from ..compat import (
- compat_urlparse,
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
- unescapeHTML,
- unified_strdate,
- US_RATINGS,
- determine_ext,
- mimetype2ext,
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
)
from .common import InfoExtractor
-class VikiIE(InfoExtractor):
- IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+ _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+ _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+ _APP = '65535a'
+ _APP_VERSION = '2.2.5.1428709186'
+ _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+ def _prepare_call(self, path, timestamp=None):
+ path += '?' if '?' not in path else '&'
+ if not timestamp:
+ timestamp = int(time.time())
+ query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+ sig = hmac.new(
+ self._APP_SECRET.encode('ascii'),
+ query.encode('ascii'),
+ hashlib.sha1
+ ).hexdigest()
+ return self._API_URL_TEMPLATE % (query, sig)
+
+ def _call_api(self, path, video_id, note, timestamp=None):
+ resp = self._download_json(
+ self._prepare_call(path, timestamp), video_id, note)
+
+ error = resp.get('error')
+ if error:
+ if error == 'invalid timestamp':
+ resp = self._download_json(
+ self._prepare_call(path, int(resp['current_timestamp'])),
+ video_id, '%s (retry)' % note)
+ error = resp.get('error')
+ if error:
+ self._raise_error(resp['error'])
+
+ return resp
- # iPad2
- _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
+ def _raise_error(self, error):
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error),
+ expected=True)
- _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+
+class VikiIE(VikiBaseIE):
+ IE_NAME = 'viki'
+ _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
@@ -37,111 +73,218 @@ class VikiIE(InfoExtractor):
},
'skip': 'Blocked in the US',
}, {
+ # clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
- 'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
+ 'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
'info_dict': {
'id': '1067139v',
'ext': 'mp4',
+ 'title': "'The Avengers: Age of Ultron' Press Conference",
'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+ 'duration': 352,
+ 'timestamp': 1430380829,
'upload_date': '20150430',
- 'title': '\'The Avengers: Age of Ultron\' Press Conference',
+ 'uploader': 'Arirang TV',
+ 'like_count': int,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': {
'id': '1048879v',
'ext': 'mp4',
- 'upload_date': '20140820',
- 'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
'title': 'Ankhon Dekhi',
+ 'duration': 6512,
+ 'timestamp': 1408532356,
+ 'upload_date': '20140820',
+ 'uploader': 'Spuul',
+ 'like_count': int,
+ 'age_limit': 13,
},
'params': {
- # requires ffmpeg
+ # m3u8 download
'skip_download': True,
}
+ }, {
+ # episode
+ 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+ 'md5': '190f3ef426005ba3a080a63325955bc3',
+ 'info_dict': {
+ 'id': '44699v',
+ 'ext': 'mp4',
+ 'title': 'Boys Over Flowers - Episode 1',
+ 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+ 'duration': 4155,
+ 'timestamp': 1270496524,
+ 'upload_date': '20100405',
+ 'uploader': 'group8',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ # youtube external
+ 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+ 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'info_dict': {
+ 'id': '50562v',
+ 'ext': 'mp4',
+ 'title': 'Poor Nastya [COMPLETE] - Episode 1',
+ 'description': '',
+ 'duration': 607,
+ 'timestamp': 1274949505,
+ 'upload_date': '20101213',
+ 'uploader': 'ad14065n',
+ 'uploader_id': 'ad14065n',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ 'url': 'http://www.viki.com/player/44699v',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- uploader_m = re.search(
- r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
- if uploader_m is None:
- uploader = None
- else:
- uploader = uploader_m.group(1).strip()
-
- rating_str = self._html_search_regex(
- r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
- 'rating information', default='').strip()
- age_limit = US_RATINGS.get(rating_str)
-
- req = compat_urllib_request.Request(
- 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
- req.add_header('User-Agent', self._USER_AGENT)
- info_webpage = self._download_webpage(
- req, video_id, note='Downloading info page')
- err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
- if err_msg:
- if 'not available in your region' in err_msg:
- raise ExtractorError(
- 'Video %s is blocked from your location.' % video_id,
- expected=True)
- else:
- raise ExtractorError('Viki said: ' + err_msg)
- mobj = re.search(
- r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
- if not mobj:
- raise ExtractorError('Unable to find video URL')
- video_url = unescapeHTML(mobj.group('url'))
- video_ext = mimetype2ext(mobj.group('mime_type'))
-
- if determine_ext(video_url) == 'm3u8':
- formats = self._extract_m3u8_formats(
- video_url, video_id, ext=video_ext)
- else:
- formats = [{
- 'url': video_url,
- 'ext': video_ext,
- }]
-
- upload_date_str = self._html_search_regex(
- r'"created_at":"([^"]+)"', info_webpage, 'upload date')
- upload_date = (
- unified_strdate(upload_date_str)
- if upload_date_str is not None
- else None
- )
-
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
- return {
+ video = self._call_api(
+ 'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+ title = None
+ titles = video.get('titles')
+ if titles:
+ title = titles.get('en') or titles[titles.keys()[0]]
+ if not title:
+ title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+ container_titles = video.get('container', {}).get('titles')
+ if container_titles:
+ container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]]
+ title = '%s - %s' % (container_title, title)
+
+ descriptions = video.get('descriptions')
+ description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None
+
+ duration = int_or_none(video.get('duration'))
+ timestamp = parse_iso8601(video.get('created_at'))
+ uploader = video.get('author')
+ like_count = int_or_none(video.get('likes', {}).get('count'))
+ age_limit = parse_age_limit(video.get('rating'))
+
+ thumbnails = []
+ for thumbnail_id, thumbnail in video.get('images', {}).items():
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail.get('url'),
+ })
+
+ subtitles = {}
+ for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+ subtitles[subtitle_lang] = [{
+ 'ext': subtitles_format,
+ 'url': self._prepare_call(
+ 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+ } for subtitles_format in ('srt', 'vtt')]
+
+ result = {
'id': video_id,
'title': title,
- 'formats': formats,
'description': description,
- 'thumbnail': thumbnail,
- 'age_limit': age_limit,
+ 'duration': duration,
+ 'timestamp': timestamp,
'uploader': uploader,
- 'subtitles': video_subtitles,
- 'upload_date': upload_date,
+ 'like_count': like_count,
+ 'age_limit': age_limit,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
}
- def _get_subtitles(self, video_id, info_webpage):
- res = {}
- for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
- sturl = unescapeHTML(sturl_html)
- m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
- if not m:
- continue
- res[m.group('lang')] = [{
- 'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
- 'ext': 'vtt',
- }]
- return res
+ streams = self._call_api(
+ 'videos/%s/streams.json' % video_id, video_id,
+ 'Downloading video streams JSON')
+
+ if 'external' in streams:
+ result.update({
+ '_type': 'url_transparent',
+ 'url': streams['external']['url'],
+ })
+ return result
+
+ formats = []
+ for format_id, stream_dict in streams.items():
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
+ for protocol, format_dict in stream_dict.items():
+ if format_id == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol)
+ else:
+ formats.append({
+ 'url': format_dict['url'],
+ 'format_id': '%s-%s' % (format_id, protocol),
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ result['formats'] = formats
+ return result
+
+
+class VikiChannelIE(VikiBaseIE):
+ IE_NAME = 'viki:channel'
+ _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+ _TESTS = [{
+ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+ 'info_dict': {
+ 'id': '50c',
+ 'title': 'Boys Over Flowers',
+ 'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+ },
+ 'playlist_count': 70,
+ }, {
+ 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+ 'info_dict': {
+ 'id': '1354c',
+ 'title': 'Poor Nastya [COMPLETE]',
+ 'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+ },
+ 'playlist_count': 127,
+ }, {
+ 'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.viki.com/artists/2141c-shinee',
+ 'only_matching': True,
+ }]
+
+ _PER_PAGE = 25
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ channel = self._call_api(
+ 'containers/%s.json' % channel_id, channel_id,
+ 'Downloading channel JSON')
+
+ titles = channel['titles']
+ title = titles.get('en') or titles[titles.keys()[0]]
+
+ descriptions = channel['descriptions']
+ description = descriptions.get('en') or descriptions[descriptions.keys()[0]]
+
+ entries = []
+ for video_type in ('episodes', 'clips', 'movies'):
+ for page_num in itertools.count(1):
+ page = self._call_api(
+ 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+ % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+ 'Downloading %s JSON page #%d' % (video_type, page_num))
+ for video in page['response']:
+ video_id = video['id']
+ entries.append(self.url_result(
+ 'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+ if not page['pagination']['next']:
+ break
+
+ return self.playlist_result(entries, channel_id, title, description)
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index c3fde53f5..a6d9b5fee 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):
links_code = self._search_regex(
r'''(?xs)
(?:
- <img\s+src="/im/play.gif".*?>|
+ <img\s+src="[^"]*/play.gif".*?>|
<!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
)
(.*?)
diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py
index 1eb24a3d6..faa167e65 100644
--- a/youtube_dl/extractor/vulture.py
+++ b/youtube_dl/extractor/vulture.py
@@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):
query_webpage = self._download_webpage(
query_url, display_id, note='Downloading query page')
params_json = self._search_regex(
- r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
+ r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
query_webpage,
'player params')
params = json.loads(params_json)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index d6dec25ca..f69d46a28 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
+ [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"],
+ webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
index 8c6241aed..7c9d8af6f 100644
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):
r'minus_track\.dur_sec=\'([0-9]*?)\'',
webpage, 'duration', fatal=False))
filesize_approx = parse_filesize(self._html_search_regex(
- r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+ r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
webpage, 'approximate filesize', fatal=False))
tbr = int_or_none(self._html_search_regex(
r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
@@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):
description = re.sub(' *\r *', '\n', description)
enc_token = self._html_search_regex(
- r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+ r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
token = ''.join(
c if pos == 3 else compat_chr(compat_ord(c) - 1)
for pos, c in enumerate(reversed(enc_token)))
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index bf4e659ac..f9afbdbab 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,6 +15,7 @@ from ..utils import (
unescapeHTML,
ExtractorError,
int_or_none,
+ mimetype2ext,
)
from .nbc import NBCSportsVPlayerIE
@@ -236,6 +237,22 @@ class YahooIE(InfoExtractor):
self._sort_formats(formats)
+ closed_captions = self._html_search_regex(
+ r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+ default='[]')
+
+ cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+ subtitles = {}
+ if cc_json:
+ for closed_caption in cc_json:
+ lang = closed_caption['lang']
+ if lang not in subtitles:
+ subtitles[lang] = []
+ subtitles[lang].append({
+ 'url': closed_caption['url'],
+ 'ext': mimetype2ext(closed_caption['content_type']),
+ })
+
return {
'id': video_id,
'display_id': display_id,
@@ -244,6 +261,7 @@ class YahooIE(InfoExtractor):
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
'duration': int_or_none(meta.get('duration')),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e58184adc..419f7b019 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600)
+ def _ids_to_results(self, ids):
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
def _login(self):
"""
Attempt to log in to YouTube.
@@ -1121,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
else:
- # Hide the formats we found through non-DASH
+ # Remove the formats we found through non-DASH, they
+ # contain less info and it can be wrong, because we use
+ # fixed values (for example the resolution). See
+ # https://github.com/rg3/youtube-dl/issues/5774 for an
+ # example.
dash_keys = set(df['format_id'] for df in dash_formats)
- for f in formats:
- if f['format_id'] in dash_keys:
- f['format_id'] = 'nondash-%s' % f['format_id']
- f['preference'] = f.get('preference', 0) - 10000
+ formats = [f for f in formats if f['format_id'] not in dash_keys]
formats.extend(dash_formats)
# Check for malformed aspect ratio
@@ -1261,11 +1267,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@@ -1398,6 +1399,24 @@ class YoutubeChannelIE(InfoExtractor):
channel_id = self._match_id(url)
url = self._TEMPLATE_URL % channel_id
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+ url + '?view=57', channel_id,
+ 'Downloading channel page', fatal=False)
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_playlist_id = self._search_regex(
+ r'data-channel-external-id="([^"]+)"',
+ channel_page, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+ playlist_id = 'UU' + channel_playlist_id[2:]
+ return self.url_result(
+ compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
autogenerated = re.search(r'''(?x)
class="[^"]*?(?:
@@ -1601,20 +1620,10 @@ class YoutubeShowIE(InfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
- Base class for extractors that fetch info from
- http://www.youtube.com/feed_ajax
+ Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- # use action_load_personal_feed instead of action_load_system_feed
- _PERSONAL_FEED = False
-
- @property
- def _FEED_TEMPLATE(self):
- action = 'action_load_system_feed'
- if self._PERSONAL_FEED:
- action = 'action_load_personal_feed'
- return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
@property
def IE_NAME(self):
@@ -1624,67 +1633,23 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- feed_entries = []
- paging = 0
- for i in itertools.count(1):
- info = self._download_json(
- self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i,
- transform_source=uppercase_escape)
- feed_html = info.get('feed_html') or info.get('content_html')
- load_more_widget_html = info.get('load_more_widget_html') or feed_html
- m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
- ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in ids)
- mobj = re.search(
- r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
- load_more_widget_html)
- if mobj is None:
- break
- paging = mobj.group('paging')
- return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-
-
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_NAME = 'youtube:recommended'
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
- _FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = 'Youtube Recommended videos'
-
-
-class YoutubeWatchLaterIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:watchlater'
- IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
-
- _TESTS = [] # override PlaylistIE tests
-
- def _real_extract(self, url):
- return self._extract_playlist('WL')
-
-
-class YoutubeHistoryIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:history'
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube History'
- page = self._download_webpage('https://www.youtube.com/feed/history', title)
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
# The extraction process is the same as for playlists, but the regex
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
-
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
+
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for unicity and break when portion has no new videos
+ new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ if not new_ids:
+ break
+
ids.extend(new_ids)
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
@@ -1692,17 +1657,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE):
break
more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
+ 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
transform_source=uppercase_escape)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+ return self.playlist_result(
+ self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+ IE_NAME = 'youtube:watchlater'
+ IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+ _TESTS = [] # override PlaylistIE tests
+
+ def _real_extract(self, url):
+ return self._extract_playlist('WL')
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -1717,42 +1690,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
return self.url_result(playlist_id, 'YoutubePlaylist')
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:subscriptions'
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube Subscriptions'
- page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
- # The extraction process is the same as for playlists, but the regex
- # for the video ids doesn't contain an index
- ids = []
- more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _FEED_NAME = 'recommended'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
- for page_num in itertools.count(1):
- matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
- ids.extend(new_ids)
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ _FEED_NAME = 'subscriptions'
+ _PLAYLIST_TITLE = 'Youtube Subscriptions'
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+ _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PLAYLIST_TITLE = 'Youtube History'
class YoutubeTruncatedURLIE(InfoExtractor):
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 22dbc3aec..5a2315bd9 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):
verbosity.add_option(
'--dump-pages', '--dump-intermediate-pages',
action='store_true', dest='dump_intermediate_pages', default=False,
- help='Print downloaded pages to debug problems (very verbose)')
+ help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
verbosity.add_option(
'--write-pages',
action='store_true', dest='write_pages', default=False,
@@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None):
help='Parse additional metadata like song title / artist from the video title. '
'The format syntax is the same as --output, '
'the parsed parameters replace existing values. '
- 'Additional templates: %(album), %(artist). '
+ 'Additional templates: %(album)s, %(artist)s. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise"')
postproc.add_option(
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 8f825f785..774494efd 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
- elif info['ext'] == 'm4a':
+ elif info['ext'] in ['m4a', 'mp4']:
if not check_executable('AtomicParsley', ['-v']):
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
@@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
else:
- raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+ raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
return [], info
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ed9ed9ed6..52d198fa3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1665,6 +1665,7 @@ def mimetype2ext(mt):
return {
'x-ms-wmv': 'wmv',
'x-mp4-fragmented': 'mp4',
+ 'ttml+xml': 'ttml',
}.get(res, res)
@@ -1848,9 +1849,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text)
for child in node:
- if child.tag == _x('ttml:br'):
+ if child.tag in (_x('ttml:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
- elif child.tag == _x('ttml:span'):
+ elif child.tag in (_x('ttml:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1859,7 +1860,10 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p'))
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+
+ if not paras:
+ raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib['begin'])
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 38f00bc9b..9cf84ff71 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.05.15'
+__version__ = '2015.06.04.1'