about | summary | refs | log | tree | commit | diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x youtube_dl/YoutubeDL.py 39
-rw-r--r-- youtube_dl/extractor/__init__.py 20
-rw-r--r-- youtube_dl/extractor/aftonbladet.py 11
-rw-r--r-- youtube_dl/extractor/bilibili.py 2
-rw-r--r-- youtube_dl/extractor/brightcove.py 31
-rw-r--r-- youtube_dl/extractor/cbs.py 23
-rw-r--r-- youtube_dl/extractor/cnet.py 19
-rw-r--r-- youtube_dl/extractor/common.py 6
-rw-r--r-- youtube_dl/extractor/crunchyroll.py 30
-rw-r--r-- youtube_dl/extractor/discovery.py 52
-rw-r--r-- youtube_dl/extractor/dramafever.py 160
-rw-r--r-- youtube_dl/extractor/empflix.py 2
-rw-r--r-- youtube_dl/extractor/facebook.py 2
-rw-r--r-- youtube_dl/extractor/fivetv.py 88
-rw-r--r-- youtube_dl/extractor/generic.py 233
-rw-r--r-- youtube_dl/extractor/imgur.py 6
-rw-r--r-- youtube_dl/extractor/instagram.py 11
-rw-r--r-- youtube_dl/extractor/iprima.py 16
-rw-r--r-- youtube_dl/extractor/iqiyi.py 296
-rw-r--r-- youtube_dl/extractor/izlesene.py 18
-rw-r--r-- youtube_dl/extractor/kickstarter.py 15
-rw-r--r-- youtube_dl/extractor/liveleak.py 16
-rw-r--r-- youtube_dl/extractor/nfl.py 6
-rw-r--r-- youtube_dl/extractor/niconico.py 3
-rw-r--r-- youtube_dl/extractor/noco.py 4
-rw-r--r-- youtube_dl/extractor/nova.py 179
-rw-r--r-- youtube_dl/extractor/nowtv.py 192
-rw-r--r-- youtube_dl/extractor/patreon.py 2
-rw-r--r-- youtube_dl/extractor/porn91.py 71
-rw-r--r-- youtube_dl/extractor/pornhub.py 12
-rw-r--r-- youtube_dl/extractor/pornovoisines.py 4
-rw-r--r-- youtube_dl/extractor/prosiebensat1.py 17
-rw-r--r-- youtube_dl/extractor/qqmusic.py 85
-rw-r--r-- youtube_dl/extractor/rtbf.py 20
-rw-r--r-- youtube_dl/extractor/rtlnl.py 7
-rw-r--r-- youtube_dl/extractor/rtlnow.py 174
-rw-r--r-- youtube_dl/extractor/ruutu.py 119
-rw-r--r-- youtube_dl/extractor/senateisvp.py 8
-rw-r--r-- youtube_dl/extractor/soompi.py 146
-rw-r--r-- youtube_dl/extractor/spiegeltv.py 50
-rw-r--r-- youtube_dl/extractor/sunporno.py 2
-rw-r--r-- youtube_dl/extractor/teamcoco.py 29
-rw-r--r-- youtube_dl/extractor/tf1.py 5
-rw-r--r-- youtube_dl/extractor/theplatform.py 23
-rw-r--r-- youtube_dl/extractor/tlc.py 15
-rw-r--r-- youtube_dl/extractor/tnaflix.py 12
-rw-r--r-- youtube_dl/extractor/tube8.py 14
-rw-r--r-- youtube_dl/extractor/tubitv.py 84
-rw-r--r-- youtube_dl/extractor/tumblr.py 22
-rw-r--r-- youtube_dl/extractor/turbo.py 4
-rw-r--r-- youtube_dl/extractor/tvc.py 109
-rw-r--r-- youtube_dl/extractor/tvigle.py 39
-rw-r--r-- youtube_dl/extractor/tvplay.py 17
-rw-r--r-- youtube_dl/extractor/twentyfourvideo.py 4
-rw-r--r-- youtube_dl/extractor/vbox7.py 21
-rw-r--r-- youtube_dl/extractor/vgtv.py 28
-rw-r--r-- youtube_dl/extractor/vidme.py 9
-rw-r--r-- youtube_dl/extractor/vk.py 4
-rw-r--r-- youtube_dl/extractor/youtube.py 69
-rw-r--r-- youtube_dl/options.py 4
-rw-r--r-- youtube_dl/postprocessor/embedthumbnail.py 4
-rw-r--r-- youtube_dl/update.py 2
-rw-r--r-- youtube_dl/version.py 2
63 files changed, 2206 insertions, 511 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index d1953c18f..aacec2958 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -49,6 +49,7 @@ from .utils import (
ExtractorError,
format_bytes,
formatSeconds,
+ HEADRequest,
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
@@ -118,7 +119,7 @@ class YoutubeDL(object):
username: Username for authentication purposes.
password: Password for authentication purposes.
- videopassword: Password for acces a video.
+ videopassword: Password for accessing a video.
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
@@ -923,8 +924,9 @@ class YoutubeDL(object):
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
if audiovideo_formats:
return audiovideo_formats[format_idx]
- # for audio only urls, select the best/worst audio format
- elif all(f.get('acodec') != 'none' for f in available_formats):
+ # for audio only (soundcloud) or video only (imgur) urls, select the best/worst available format
+ elif (all(f.get('acodec') != 'none' for f in available_formats) or
+ all(f.get('vcodec') != 'none' for f in available_formats)):
return available_formats[format_idx]
elif format_spec == 'bestaudio':
audio_formats = [
@@ -1014,13 +1016,13 @@ class YoutubeDL(object):
info_dict['display_id'] = info_dict['id']
if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
- # Working around negative timestamps in Windows
- # (see http://bugs.python.org/issue1646728)
- if info_dict['timestamp'] < 0 and os.name == 'nt':
- info_dict['timestamp'] = 0
- upload_date = datetime.datetime.utcfromtimestamp(
- info_dict['timestamp'])
- info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+ # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+ # see http://bugs.python.org/issue1646728)
+ try:
+ upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+ info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+ except (ValueError, OverflowError, OSError):
+ pass
if self.params.get('listsubtitles', False):
if 'automatic_captions' in info_dict:
@@ -1047,6 +1049,8 @@ class YoutubeDL(object):
if not formats:
raise ExtractorError('No video formats found!')
+ formats_dict = {}
+
# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
if 'url' not in format:
@@ -1054,6 +1058,18 @@ class YoutubeDL(object):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
+ format_id = format['format_id']
+ if format_id not in formats_dict:
+ formats_dict[format_id] = []
+ formats_dict[format_id].append(format)
+
+ # Make sure all formats have unique format_id
+ for format_id, ambiguous_formats in formats_dict.items():
+ if len(ambiguous_formats) > 1:
+ for i, format in enumerate(ambiguous_formats):
+ format['format_id'] = '%s-%d' % (format_id, i)
+
+ for i, format in enumerate(formats):
if format.get('format') is None:
format['format'] = '{id} - {res}{note}'.format(
id=format['format_id'],
@@ -1706,7 +1722,8 @@ class YoutubeDL(object):
if req_is_string:
req = url_escaped
else:
- req = compat_urllib_request.Request(
+ req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+ req = req_type(
url_escaped, data=req.data, headers=req.headers,
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 80c9cb107..6fdaf90b2 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -112,6 +112,10 @@ from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
from .douyutv import DouyuTVIE
+from .dramafever import (
+ DramaFeverIE,
+ DramaFeverSeriesIE,
+)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
@@ -152,6 +156,7 @@ from .fc2 import FC2IE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
@@ -229,6 +234,7 @@ from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
from .ivi import (
IviIE,
IviCompilationIE
@@ -352,8 +358,10 @@ from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
+from .nova import NovaIE
from .novamov import NovaMovIE
from .nowness import NownessIE
+from .nowtv import NowTVIE
from .nowvideo import NowVideoIE
from .npo import (
NPOIE,
@@ -400,6 +408,7 @@ from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE
from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
@@ -437,7 +446,6 @@ from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rte import RteIE
from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
from .rts import RTSIE
@@ -451,6 +459,7 @@ from .rutube import (
RutubePersonIE,
)
from .rutv import RUTVIE
+from .ruutu import RuutuIE
from .sandia import SandiaIE
from .safari import (
SafariIE,
@@ -480,6 +489,10 @@ from .smotri import (
)
from .snotr import SnotrIE
from .sohu import SohuIE
+from .soompi import (
+ SoompiIE,
+ SoompiShowIE,
+)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
@@ -565,6 +578,7 @@ from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tunein import TuneInIE
@@ -575,6 +589,10 @@ from .tv2 import (
TV2ArticleIE,
)
from .tv4 import TV4IE
+from .tvc import (
+ TVCIE,
+ TVCArticleIE,
+)
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index a117502bc..e0518cf26 100644
--- a/youtube_dl/extractor/aftonbladet.py
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -6,11 +6,11 @@ from ..utils import int_or_none
class AftonbladetIE(InfoExtractor):
- _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+ _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+ 'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'info_dict': {
- 'id': 'article36015',
+ 'id': '36015',
'ext': 'mp4',
'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
# find internal video meta data
meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
- internal_meta_id = self._html_search_regex(
- r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+ player_config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+ internal_meta_id = player_config['videoId']
internal_meta_url = meta_url % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 2103ed73a..bf60450c2 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -105,7 +105,7 @@ class BiliBiliIE(InfoExtractor):
'filesize': int_or_none(
lq_durl.find('./size'), get_attr='text'),
}]
- if hq_durl:
+ if hq_durl is not None:
formats.append({
'format_id': 'hq',
'quality': 2,
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 4f60d5366..d768f99e6 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -156,6 +156,28 @@ class BrightcoveIE(InfoExtractor):
linkBase = find_param('linkBaseURL')
if linkBase is not None:
params['linkBaseURL'] = linkBase
+ return cls._make_brightcove_url(params)
+
+ @classmethod
+ def _build_brighcove_url_from_js(cls, object_js):
+ # The layout of JS is as follows:
+ # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+ # // build Brightcove <object /> XML
+ # }
+ m = re.search(
r'''(?x)customBC\.createVideo\(
+ .*? # skipping width and height
+ ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
+ ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
+ # in length, however it's appended to itself
+ # in places, so truncate
+ ["\'](?P<videoID>\d+)["\'] # @videoPlayer
+ ''', object_js)
+ if m:
+ return cls._make_brightcove_url(m.groupdict())
+
+ @classmethod
+ def _make_brightcove_url(cls, params):
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
@@ -172,7 +194,7 @@ class BrightcoveIE(InfoExtractor):
"""Return a list of all Brightcove URLs from the webpage """
url_m = re.search(
- r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+ r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
webpage)
if url_m:
url = unescapeHTML(url_m.group(1))
@@ -188,7 +210,12 @@ class BrightcoveIE(InfoExtractor):
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
).+?>\s*</object>''',
webpage)
- return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+ if matches:
+ return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 1ceb9d8d9..75fffb156 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -4,12 +4,13 @@ from .common import InfoExtractor
class CBSIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+ _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '4JUVEwq3wUT7',
+ 'display_id': 'connect-chat-feat-garth-brooks',
'ext': 'flv',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -24,6 +25,7 @@ class CBSIE(InfoExtractor):
'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
'info_dict': {
'id': 'WWF_5KqY3PK1',
+ 'display_id': 'st-vincent',
'ext': 'flv',
'title': 'Live on Letterman - St. Vincent',
'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -34,12 +36,23 @@ class CBSIE(InfoExtractor):
'skip_download': True,
},
'_skip': 'Blocked outside the US',
+ }, {
+ 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
real_id = self._search_regex(
- r"video\.settings\.pid\s*=\s*'([^']+)';",
+ [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
webpage, 'real video ID')
- return self.url_result('theplatform:%s' % real_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': 'theplatform:%s' % real_id,
+ 'display_id': display_id,
+ }
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 3145b3051..5dd69bff7 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -11,7 +11,7 @@ from ..utils import (
class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
'params': {
'skip_download': 'requires rtmpdump',
}
- }
+ }, {
+ 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+ 'info_dict': {
+ 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+ 'ext': 'flv',
+ 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+ 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+ 'uploader': 'Ashley Esqueda',
+ 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
raise ExtractorError('Cannot find video data')
mpx_account = data['config']['players']['default']['mpx_account']
- vid = vdata['files']['rtmp']
+ vid = vdata['files'].get('rtmp', vdata['files']['hds'])
tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
video_id = vdata['id']
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cecf917ff..49e4dc710 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -846,7 +846,7 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
- m3u8_id=None):
+ m3u8_id=None, note=None, errnote=None):
formats = [{
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -865,8 +865,8 @@ class InfoExtractor(object):
m3u8_doc = self._download_webpage(
m3u8_url, video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information')
last_info = None
last_media = None
kv_rex = re.compile(
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47e..41f0c736d 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):
self._login()
def _decrypt_subtitles(self, data, iv, id):
- data = bytes_to_intlist(data)
- iv = bytes_to_intlist(iv)
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+ iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
id = int(id)
def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
+ def _extract_subtitles(self, subtitle):
+ sub_root = xml.etree.ElementTree.fromstring(subtitle)
+ return [{
+ 'ext': 'srt',
+ 'data': self._convert_subtitles_to_srt(sub_root),
+ }, {
+ 'ext': 'ass',
+ 'data': self._convert_subtitles_to_ass(sub_root),
+ }]
+
def _get_subtitles(self, video_id, webpage):
subtitles = {}
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
if not id or not iv or not data:
continue
- id = int(id)
- iv = base64.b64decode(iv)
- data = base64.b64decode(data)
-
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
- sub_root = xml.etree.ElementTree.fromstring(subtitle)
- subtitles[lang_code] = [
- {
- 'ext': 'srt',
- 'data': self._convert_subtitles_to_srt(sub_root),
- },
- {
- 'ext': 'ass',
- 'data': self._convert_subtitles_to_ass(sub_root),
- },
- ]
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
return subtitles
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index d3e667528..d6723ecf2 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ parse_duration,
parse_iso8601,
- int_or_none,
)
+from ..compat import compat_str
class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
'info_dict': {
- 'id': 'mission-impossible-outtakes',
- 'ext': 'flv',
+ 'id': '20769',
+ 'ext': 'mp4',
'title': 'Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
'timestamp': 1303099200,
'upload_date': '20110418',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
+ }, {
+ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+ 'info_dict': {
+ 'id': 'mythbusters-the-simpsons',
+ 'title': 'MythBusters: The Simpsons',
+ },
+ 'playlist_count': 9,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ info = self._download_json(url + '?flat=1', video_id)
- info = self._parse_json(self._search_regex(
- r'(?s)<script type="application/ld\+json">(.*?)</script>',
- webpage, 'video info'), video_id)
+ video_title = info.get('playlist_title') or info.get('video_title')
- return {
- 'id': video_id,
- 'title': info['name'],
- 'url': info['contentURL'],
- 'description': info.get('description'),
- 'thumbnail': info.get('thumbnailUrl'),
- 'timestamp': parse_iso8601(info.get('uploadDate')),
- 'duration': int_or_none(info.get('duration')),
- }
+ entries = [{
+ 'id': compat_str(video_info['id']),
+ 'formats': self._extract_m3u8_formats(
+ video_info['src'], video_id, ext='mp4',
+ note='Download m3u8 information for video %d' % (idx + 1)),
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'duration': parse_duration(video_info.get('video_length')),
+ 'webpage_url': video_info.get('href'),
+ 'thumbnail': video_info.get('thumbnailURL'),
+ 'alt_title': video_info.get('secondary_title'),
+ 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+ } for idx, video_info in enumerate(info['playlist'])]
+
+ return self.playlist_result(entries, video_id, video_title)
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644
index 000000000..a34aad486
--- /dev/null
+++ b/youtube_dl/extractor/dramafever.py
@@ -0,0 +1,160 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class DramaFeverIE(InfoExtractor):
+ IE_NAME = 'dramafever'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+ _TEST = {
+ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512.1',
+ 'ext': 'flv',
+ 'title': 'Cooking with Shin 4512.1',
+ 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1404336058,
+ 'upload_date': '20140702',
+ 'duration': 343,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url).replace('/', '.')
+
+ try:
+ feed = self._download_json(
+ 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+ video_id, 'Downloading episode JSON')['channel']['item']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError):
+ raise ExtractorError(
+ 'Currently unavailable in your country.', expected=True)
+ raise
+
+ media_group = feed.get('media-group', {})
+
+ formats = []
+ for media_content in media_group['media-content']:
+ src = media_content.get('@attributes', {}).get('url')
+ if not src:
+ continue
+ ext = determine_ext(src)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src, video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ title = media_group.get('media-title')
+ description = media_group.get('media-description')
+ duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+ thumbnail = self._proto_relative_url(
+ media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+ timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+ subtitles = {}
+ for media_subtitle in media_group.get('media-subTitle', []):
+ lang = media_subtitle.get('@attributes', {}).get('lang')
+ href = media_subtitle.get('@attributes', {}).get('href')
+ if not lang or not href:
+ continue
+ subtitles[lang] = [{
+ 'ext': 'ttml',
+ 'url': href,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class DramaFeverSeriesIE(InfoExtractor):
+ IE_NAME = 'dramafever:series'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+ _TESTS = [{
+ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512',
+ 'title': 'Cooking with Shin',
+ 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+ },
+ 'playlist_count': 4,
+ }, {
+ 'url': 'http://www.dramafever.com/drama/124/IRIS/',
+ 'info_dict': {
+ 'id': '124',
+ 'title': 'IRIS',
+ 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+ },
+ 'playlist_count': 20,
+ }]
+
+ _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+ _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+ def _get_consumer_secret(self, video_id):
+ mainjs = self._download_webpage(
+ 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+ video_id, 'Downloading main.js', fatal=False)
+ if not mainjs:
+ return self._CONSUMER_SECRET
+ return self._search_regex(
+ r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+ 'consumer secret', default=self._CONSUMER_SECRET)
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+
+ consumer_secret = self._get_consumer_secret(series_id)
+
+ series = self._download_json(
+ 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+ % (consumer_secret, series_id),
+ series_id, 'Downloading series JSON')['series'][series_id]
+
+ title = clean_html(series['name'])
+ description = clean_html(series.get('description') or series.get('description_short'))
+
+ entries = []
+ for page_num in itertools.count(1):
+ episodes = self._download_json(
+ 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+ % (consumer_secret, series_id, self._PAGE_SIZE, page_num),
+ series_id, 'Downloading episodes JSON page #%d' % page_num)
+ for episode in episodes.get('value', []):
+ entries.append(self.url_result(
+ compat_urlparse.urljoin(url, episode['episode_url']),
+ 'DramaFever', episode.get('guid')))
+ if page_num == episodes['num_pages']:
+ break
+
+ return self.playlist_result(entries, series_id, title, description)
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
index 9a5a8f4bb..4827022e0 100644
--- a/youtube_dl/extractor/empflix.py
+++ b/youtube_dl/extractor/empflix.py
@@ -26,6 +26,6 @@ class EMPFlixIE(TNAFlixIE):
},
{
'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
- 'matching_only': True,
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index e8d682716..82dc27bc6 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -152,7 +152,7 @@ class FacebookIE(InfoExtractor):
raise ExtractorError('Cannot find video formats')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
default=None)
if not video_title:
video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644
index 000000000..13fbc4da2
--- /dev/null
+++ b/youtube_dl/extractor/fivetv.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ http://
+ (?:www\.)?5-tv\.ru/
+ (?:
+ (?:[^/]+/)+(?P<id>\d+)|
+ (?P<path>[^/?#]+)(?:[/?#])?
+ )
+ '''
+
+ _TESTS = [{
+ 'url': 'http://5-tv.ru/news/96814/',
+ 'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+ 'info_dict': {
+ 'id': '96814',
+ 'ext': 'mp4',
+ 'title': 'Россияне выбрали имя для общенациональной платежной системы',
+ 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://5-tv.ru/video/1021729/',
+ 'info_dict': {
+ 'id': '1021729',
+ 'ext': 'mp4',
+ 'title': '3D принтер',
+ 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+ 'info_dict': {
+ 'id': 'glavnoe',
+ 'ext': 'mp4',
+ 'title': 'Итоги недели с 8 по 14 июня 2015 года',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/films/1507502/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/programs/broadcast/508713/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/angel/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('path')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+ webpage, 'video url')
+
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title')
+ duration = int_or_none(self._og_search_property(
+ 'video:duration', webpage, 'duration', default=None))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9a7b0d25d..f6b984300 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,6 +9,8 @@ from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
)
@@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE
from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
+from .tvc import TVCIE
from .sportbox import SportBoxEmbedIE
from .smotri import SmotriIE
from .condenast import CondeNastIE
@@ -39,6 +42,7 @@ from .udn import UDNEmbedIE
from .senateisvp import SenateISVPIE
from .bliptv import BlipTVIE
from .svt import SVTIE
+from .pornhub import PornHubIE
class GenericIE(InfoExtractor):
@@ -46,6 +50,97 @@ class GenericIE(InfoExtractor):
_VALID_URL = r'.*'
IE_NAME = 'generic'
_TESTS = [
+ # Direct link to a video
+ {
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'ext': 'mp4',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
+ }
+ },
+ # Direct link to media delivered compressed (until Accept-Encoding is *)
+ {
+ 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+ 'md5': '128c42e68b13950268b648275386fc74',
+ 'info_dict': {
+ 'id': 'FictionJunction-Parallel_Hearts',
+ 'ext': 'flac',
+ 'title': 'FictionJunction-Parallel_Hearts',
+ 'upload_date': '20140522',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # RSS feed
+ {
+ 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'info_dict': {
+ 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'title': 'Zero Punctuation',
+ 'description': 're:.*groundbreaking video review series.*'
+ },
+ 'playlist_mincount': 11,
+ },
+ # RSS feed with enclosure
+ {
+ 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'info_dict': {
+ 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ 'ext': 'm4v',
+ 'upload_date': '20150228',
+ 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ }
+ },
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
{
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -125,17 +220,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, # m3u8 download
},
},
- # Direct link to a video
- {
- 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
- 'info_dict': {
- 'id': 'trailer',
- 'ext': 'mp4',
- 'title': 'trailer',
- 'upload_date': '20100513',
- }
- },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -160,22 +244,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
- # google redirect
- {
- 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
- 'info_dict': {
- 'id': 'cmQHVoWB5FY',
- 'ext': 'mp4',
- 'upload_date': '20130224',
- 'uploader_id': 'TheVerge',
- 'description': 're:^Chris Ziegler takes a look at the\.*',
- 'uploader': 'The Verge',
- 'title': 'First Firefox OS phones side-by-side',
- },
- 'params': {
- 'skip_download': False,
- }
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -225,6 +293,15 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # TVC embed
+ {
+ 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+ 'info_dict': {
+ 'id': '55304',
+ 'ext': 'mp4',
+ 'title': 'Дошкольное воспитание',
+ },
+ },
# SportBox embed
{
'url': 'http://www.vestifinance.ru/articles/25753',
@@ -407,16 +484,6 @@ class GenericIE(InfoExtractor):
'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
- # RSS feed
- {
- 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'info_dict': {
- 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'title': 'Zero Punctuation',
- 'description': 're:.*groundbreaking video review series.*'
- },
- 'playlist_mincount': 11,
- },
# Multiple brightcove videos
# https://github.com/rg3/youtube-dl/issues/2283
{
@@ -470,21 +537,6 @@ class GenericIE(InfoExtractor):
'uploader': 'thoughtworks.wistia.com',
},
},
- # Direct download with broken HEAD
- {
- 'url': 'http://ai-radio.org:8000/radio.opus',
- 'info_dict': {
- 'id': 'radio',
- 'ext': 'opus',
- 'title': 'radio',
- },
- 'params': {
- 'skip_download': True, # infinite live stream
- },
- 'expected_warnings': [
- r'501.*Not Implemented'
- ],
- },
# Soundcloud embed
{
'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -516,21 +568,6 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 2,
},
- # Direct link with incorrect MIME type
- {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'md5': '4ccbebe5f36706d85221f204d7eb5913',
- 'info_dict': {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'id': '5_Lennart_Poettering_-_Systemd',
- 'ext': 'webm',
- 'title': '5_Lennart_Poettering_-_Systemd',
- 'upload_date': '20141120',
- },
- 'expected_warnings': [
- 'URL could be a direct video link, returning it as such.'
- ]
- },
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -689,16 +726,6 @@ class GenericIE(InfoExtractor):
'age_limit': 0,
},
},
- # RSS feed with enclosure
- {
- 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
- 'info_dict': {
- 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- 'ext': 'm4v',
- 'upload_date': '20150228',
- 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- }
- },
# Crooks and Liars embed
{
'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -773,6 +800,18 @@ class GenericIE(InfoExtractor):
# rtmpe downloads
'skip_download': True,
}
+ },
+ # Brightcove URL in single quotes
+ {
+ 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+ 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+ 'info_dict': {
+ 'id': '4255764656001',
+ 'ext': 'mp4',
+ 'title': 'SN Presents: Russell Martin, World Citizen',
+ 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+ 'uploader': 'Rogers Sportsnet',
+ },
}
]
@@ -894,7 +933,7 @@ class GenericIE(InfoExtractor):
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
else:
- video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+ video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
self.to_screen('%s: Requesting header' % video_id)
@@ -916,7 +955,9 @@ class GenericIE(InfoExtractor):
full_response = None
if head_response is False:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
head_response = full_response
# Check for direct link to a video
@@ -927,7 +968,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'formats': [{
'format_id': m.group('format_id'),
@@ -941,7 +982,17 @@ class GenericIE(InfoExtractor):
self._downloader.report_warning('Falling back on generic information extractor.')
if not full_response:
- full_response = self._request_webpage(url, video_id)
+ request = compat_urllib_request.Request(url)
+ # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+ # making it impossible to download only a chunk of the file (yet we need only 512kB to
+ # test whether it's HTML or not). With youtube-dl's default Accept-Encoding this
+ # always results in downloading the whole file, which is not desirable.
+ # Therefore, for the extraction pass we have to override Accept-Encoding to any in order
+ # to accept raw bytes and be able to download only a chunk.
+ # It would probably be better to solve this by checking Content-Type for application/octet-stream
+ # after the HEAD request finishes, but it is not clear whether we can rely on this.
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
@@ -953,7 +1004,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'url': url,
'upload_date': upload_date,
@@ -1033,7 +1084,7 @@ class GenericIE(InfoExtractor):
# Look for embedded rtl.nl player
matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+ r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage)
if matches:
return _playlist_from_matches(matches, ie='RtlNl')
@@ -1261,11 +1312,27 @@ class GenericIE(InfoExtractor):
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
+ # Look for embedded TVC player
+ tvc_url = TVCIE._extract_url(webpage)
+ if tvc_url:
+ return self.url_result(tvc_url, 'TVC')
+
# Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
if sportbox_urls:
return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+ # Look for embedded PornHub player
+ pornhub_url = PornHubIE._extract_url(webpage)
+ if pornhub_url:
+ return self.url_result(pornhub_url, 'PornHub')
+
+ # Look for embedded Tvigle player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Tvigle')
+
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
index fe5d95e2c..d692ea79a 100644
--- a/youtube_dl/extractor/imgur.py
+++ b/youtube_dl/extractor/imgur.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
int_or_none,
js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
class ImgurIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ compat_urlparse.urljoin(url, video_id), video_id)
width = int_or_none(self._search_regex(
r'<param name="width" value="([0-9]+)"',
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index b10755788..3d78f78c4 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ limit_length,
+)
class InstagramIE(InfoExtractor):
@@ -100,11 +103,13 @@ class InstagramUserIE(InfoExtractor):
thumbnails_el = it.get('images', {})
thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
- title = it.get('caption', {}).get('text', it['id'])
+ # In some cases caption is null, which corresponds to None
+ # in python. As a result, it.get('caption', {}) gives None
+ title = (it.get('caption') or {}).get('text', it['id'])
entries.append({
'id': it['id'],
- 'title': title,
+ 'title': limit_length(title, 80),
'formats': formats,
'thumbnail': thumbnail,
'webpage_url': it.get('link'),
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 8529bedfc..821c8ec10 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -11,11 +11,12 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ remove_end,
)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
'id': '39152',
'ext': 'flv',
'title': 'Partička (92)',
- 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+ 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
'id': '9718337',
'ext': 'flv',
'title': 'Tchibo Partička - Jarní móda',
- 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
'thumbnail': 're:^http:.*\.jpg$',
},
'params': {
'skip_download': True, # requires rtmpdump
},
- 'skip': 'Do not have permission to access this page',
+ }, {
+ 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
return {
'id': real_id,
- 'title': self._og_search_title(webpage),
+ 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
- 'description': self._og_search_description(webpage),
+ 'description': self._search_regex(
+ r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+ webpage, 'description', default=None),
}
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644
index 000000000..9106dd074
--- /dev/null
+++ b/youtube_dl/extractor/iqiyi.py
@@ -0,0 +1,296 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import os.path
+import random
+import re
+import time
+import uuid
+import zlib
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ url_basename,
+)
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+
+    # The dot in the host name is escaped so that it only matches a literal '.'
+    _VALID_URL = r'http://(?:www\.)iqiyi\.com/v_.+?\.html'
+
+ _TESTS = [{
+ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+ 'md5': '2cb594dc2781e6c941a110d8f358118b',
+ 'info_dict': {
+ 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+ 'title': '美国德州空中惊现奇异云团 酷似UFO',
+ 'ext': 'f4v',
+ }
+ }, {
+ 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb',
+ 'title': '名侦探柯南第752集',
+ },
+ 'playlist': [{
+ 'md5': '7e49376fecaffa115d951634917fe105',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '41b75ba13bb7ac0e411131f92bc4f6ca',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '0cee1dd0a3d46a83e71e2badeae2aab0',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '4f8ad72373b0c491b582e7c196b0b1f9',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': 'd89ad028bcfad282918e8098e811711d',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '9cb1e5c95da25dff0660c32ae50903b7',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '155116e0ff1867bbc9b98df294faabc9',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }, {
+ 'md5': '53f5db77622ae14fa493ed2a278a082b',
+ 'info_dict': {
+ 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+ 'ext': 'f4v',
+ 'title': '名侦探柯南第752集',
+ },
+ }],
+ }]
+
+ _FORMATS_MAP = [
+ ('1', 'h6'),
+ ('2', 'h5'),
+ ('3', 'h4'),
+ ('4', 'h3'),
+ ('5', 'h2'),
+ ('10', 'h1'),
+ ]
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        """Resolve the segment URLs of every supported format.
+
+        data is the 'data' object of the VMS API response, video_id is
+        passed to the download helpers and _uuid is sent as the 'su' query
+        parameter of each segment request.
+
+        Returns a dict mapping a format id from _FORMATS_MAP to a list of
+        (video_url, filesize) tuples, one tuple per segment. Formats whose
+        bid is outside 1..10 or missing from _FORMATS_MAP are skipped.
+        """
+        def do_xor(x, y):
+            # The XOR key depends on the position modulo 3
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            # Decode a '-'-separated list of hex codes: every code is XORed
+            # with a position-dependent key and the result is reversed
+            b = l.split('-')
+            c = len(b)
+            s = ''
+            for i in range(c - 1, -1, -1):
+                s += chr(do_xor(int(b[c - i - 1], 16), i))
+            return s[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if not 0 < int(format_item['bid']) <= 10:
+                continue
+            format_id = self.get_format(format_item['bid'])
+            if format_id is None:
+                # A bid can pass the range check above without being listed
+                # in _FORMATS_MAP (e.g. 6-9); a None format id has no bid
+                # to derive a preference from, so skip such formats
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    # Paths not starting with '/' are obfuscated
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                # Insert the path key before the last component of the base URL
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        """Return the format id paired with bid in _FORMATS_MAP, or None."""
+        wanted_bid = str(bid)
+        return next(
+            (fmt for known_bid, fmt in self._FORMATS_MAP if known_bid == wanted_bid),
+            None)
+
+    def get_bid(self, format_id):
+        """Return the bid string paired with format_id in _FORMATS_MAP, or None."""
+        for known_bid, fmt in self._FORMATS_MAP:
+            if fmt == format_id:
+                return known_bid
+        return None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        """Request the video description from the VMS API and return the parsed JSON.
+
+        The 'enc' and 'authkey' query parameters are MD5 digests built from
+        the current timestamp, tvid and (for 'enc') enc_key.
+        """
+        timestamp = str(int(time.time()))
+        query = {
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': timestamp,
+            'enc': hashlib.md5(
+                (enc_key + timestamp + tvid).encode('utf8')).hexdigest(),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': hashlib.md5(
+                (timestamp + tvid).encode('utf8')).hexdigest()
+        }
+
+        query_string = compat_urllib_parse.urlencode(query)
+        return self._download_json(
+            'http://cache.video.qiyi.com/vms' + '?' + query_string, video_id)
+
+    def get_enc_key(self, swf_url, video_id):
+        """Return the enc key found in the player SWF at swf_url.
+
+        The key is cached in the downloader cache under the SWF file name
+        (without extension), so the SWF is only fetched on a cache miss.
+        """
+        cache_section = 'iqiyi-enc-key'
+        filename = os.path.splitext(url_basename(swf_url))[0]
+
+        cached = self._downloader.cache.load(cache_section, filename)
+        if cached is not None:
+            return cached[0]
+
+        swf_response = self._request_webpage(
+            swf_url, video_id, note='download swf content')
+        # The first 8 bytes are skipped before zlib-decompressing the remainder
+        swf_body = zlib.decompress(swf_response.read()[8:])
+        key_re = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv')
+        enc_key = self._search_regex(key_re, swf_body, 'enc_key').decode('utf8')
+
+        # Stored as a one-element list, matching the cached[0] read above
+        self._downloader.cache.store(cache_section, filename, [enc_key])
+
+        return enc_key
+
+    def _real_extract(self, url):
+        """Extract an iQiyi video, which may be split into several segments.
+
+        Returns a 'multi_video' result with one entry per segment when the
+        video has more than one segment, otherwise a single info dict.
+        Raises ExtractorError when the API reports an error, when the video
+        has no 'tkl' data, or when no downloadable format was found.
+        """
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('No support iQiyi VIP video')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info: the i-th entry collects the i-th segment of every format
+        entries = []
+        for format_id, video_urls in video_urls_dict.items():
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        if not entries:
+            # Without this check entries[0] below would raise an IndexError
+            raise ExtractorError('No video formats found')
+
+        for part_number, entry in enumerate(entries, 1):
+            self._sort_formats(entry['formats'])
+            entry.update(
+                {
+                    'id': '%s_part%d' % (video_id, part_number),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            # A single segment is returned as a plain video with the bare id
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 99a1361f8..bc226fa67 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404302298,
+ 'timestamp': int,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163322193,
+ 'timestamp': int,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
uploader = self._html_search_regex(
r"adduserUsername\s*=\s*'([^']+)';",
- webpage, 'uploader', fatal=False, default='')
+ webpage, 'uploader', fatal=False)
timestamp = parse_iso8601(self._html_search_meta(
- 'uploadDate', webpage, 'upload date', fatal=False))
+ 'uploadDate', webpage, 'upload date'))
duration = float_or_none(self._html_search_regex(
r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
# Might be empty for some videos.
streams = self._html_search_regex(
- r'"qualitylevel"\s*:\s*"([^"]+)"',
- webpage, 'streams', fatal=False, default='')
+ r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
formats = []
if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
formats.append({
'format_id': '%sp' % quality if quality else 'sd',
- 'url': url,
+ 'url': compat_urllib_parse_unquote(url),
'ext': ext,
})
else:
stream_url = self._search_regex(
- r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+ r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
formats.append({
'format_id': 'sd',
- 'url': stream_url,
+ 'url': compat_urllib_parse_unquote(stream_url),
'ext': ext,
})
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 7d4b57056..1d391e69f 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
'uploader': 'Pebble Technology',
'title': 'Pebble iOS Notifications',
}
+ }, {
+ 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+ 'info_dict': {
+ 'id': '1420158244',
+ 'ext': 'mp4',
+ 'title': 'Power Drive 2000',
+ },
+ 'expected_warnings': ['OpenGraph description'],
}]
def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
'title': title,
}
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+ if thumbnail is None:
+ thumbnail = self._html_search_regex(
+ r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+ webpage, 'thumbnail image', fatal=False)
return {
'id': video_id,
'url': video_url,
'title': title,
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 35822067f..857edfde2 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
}
+ }, {
+ # Covers https://github.com/rg3/youtube-dl/pull/5983
+ 'url': 'http://www.liveleak.com/view?i=801_1409392012',
+ 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'info_dict': {
+ 'id': '801_1409392012',
+ 'ext': 'mp4',
+ 'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+ 'uploader': 'bony333',
+ 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+ }
}]
def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
'url': s['file'],
} for i, s in enumerate(sources)]
for i, s in enumerate(sources):
- orig_url = s['file'].replace('.h264_base.mp4', '')
+ # Removing '.h264_*.mp4' gives the raw video, which is essentially
+ # the same video without the LiveLeak logo at the top (see
+ # https://github.com/rg3/youtube-dl/pull/4768)
+ orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
if s['file'] != orig_url:
formats.append({
'format_id': 'original-%s' % i,
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 2684dd250..dc54634a5 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -19,7 +19,7 @@ class NFLIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
(?:.+?/)*
- (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+ (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
_TESTS = [
{
'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
@@ -58,6 +58,10 @@ class NFLIE(InfoExtractor):
'upload_date': '20150202',
},
},
+ {
+ 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+ 'only_matching': True,
+ }
]
@staticmethod
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 3cecebf95..0f8aa5ada 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -182,7 +182,6 @@ class NiconicoIE(InfoExtractor):
extension = xpath_text(video_info, './/movie_type')
if not extension:
extension = determine_ext(video_real_url)
- video_format = extension.upper()
thumbnail = (
xpath_text(video_info, './/thumbnail_url') or
@@ -241,7 +240,7 @@ class NiconicoIE(InfoExtractor):
'url': video_real_url,
'title': title,
'ext': extension,
- 'format': video_format,
+ 'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
'thumbnail': thumbnail,
'description': description,
'uploader': uploader,
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 664dc81d4..5bbd2dcf6 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -166,6 +166,10 @@ class NocoIE(InfoExtractor):
self._sort_formats(formats)
timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+ if timestamp is not None and timestamp < 0:
+ timestamp = None
+
uploader = show.get('partner_name')
uploader_id = show.get('partner_key')
duration = float_or_none(show.get('duration_ms'), 1000)
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644
index 000000000..3f9c776ef
--- /dev/null
+++ b/youtube_dl/extractor/nova.py
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+ IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+ _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+ _TESTS = [{
+ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+ 'info_dict': {
+ 'id': '1608920',
+ 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+ 'ext': 'flv',
+ 'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+ 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+ 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+ 'info_dict': {
+ 'id': '1757139',
+ 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+ 'ext': 'mp4',
+ 'title': 'Podzemní nemocnice v pražské Krči',
+ 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'info_dict': {
+ 'id': '1756825',
+ 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'ext': 'flv',
+ 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+ 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+ 'info_dict': {
+ 'id': '1756858',
+ 'ext': 'flv',
+ 'title': 'Televizní noviny - 30. 5. 2015',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ 'upload_date': '20150530',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'info_dict': {
+ 'id': '1753621',
+ 'ext': 'mp4',
+ 'title': 'Zaklínač 3: Divoký hon',
+ 'description': 're:.*Pokud se stejně jako my nemůžete.*',
+ 'thumbnail': 're:https?://.*\.jpg(\?.*)?',
+ 'upload_date': '20150521',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ site = mobj.group('site')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r"(?:media|video_id)\s*:\s*'(\d+)'",
+ r'media=(\d+)',
+ r'id="article_video_(\d+)"',
+ r'id="player_(\d+)"'],
+ webpage, 'video id')
+
+ config_url = self._search_regex(
+ r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+ webpage, 'config url', default=None)
+
+ if not config_url:
+ DEFAULT_SITE_ID = '23000'
+ SITES = {
+ 'tvnoviny': DEFAULT_SITE_ID,
+ 'novaplus': DEFAULT_SITE_ID,
+ 'vymena': DEFAULT_SITE_ID,
+ 'krasna': DEFAULT_SITE_ID,
+ 'fanda': '30',
+ 'tn': '30',
+ 'doma': '30',
+ }
+
+ site_id = self._search_regex(
+ r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+ config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+ % (site_id, video_id))
+
+ config = self._download_json(
+ config_url, display_id,
+ 'Downloading config JSON',
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+ mediafile = config['mediafile']
+ video_url = mediafile['src']
+
+ m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+ if m:
+ formats = [{
+ 'url': m.group('url'),
+ 'app': m.group('app'),
+ 'play_path': m.group('playpath'),
+ 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+ 'ext': 'flv',
+ }]
+ else:
+ formats = [{
+ 'url': video_url,
+ }]
+ self._sort_formats(formats)
+
+ title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+ description = clean_html(self._og_search_description(webpage, default=None))
+ thumbnail = config.get('poster')
+
+ if site == 'novaplus':
+ upload_date = unified_strdate(self._search_regex(
+ r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+ elif site == 'fanda':
+ upload_date = unified_strdate(self._search_regex(
+ r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+ else:
+ upload_date = None
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644
index 000000000..173e46cd8
--- /dev/null
+++ b/youtube_dl/extractor/nowtv.py
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ parse_duration,
+ remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+ _TESTS = [{
+ # rtl
+ 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+ 'info_dict': {
+ 'id': '203519',
+ 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+ 'ext': 'mp4',
+ 'title': 'Die neuen Bauern und eine Hochzeit',
+ 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432580700,
+ 'upload_date': '20150525',
+ 'duration': 2786,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # rtl2
+ 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+ 'info_dict': {
+ 'id': '203481',
+ 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+ 'ext': 'mp4',
+ 'title': 'Berlin - Tag & Nacht (Folge 934)',
+ 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432666800,
+ 'upload_date': '20150526',
+ 'duration': 2641,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # rtlnitro
+ 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+ 'info_dict': {
+ 'id': '165780',
+ 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+ 'ext': 'mp4',
+ 'title': 'Hals- und Beinbruch',
+ 'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432415400,
+ 'upload_date': '20150523',
+ 'duration': 2742,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # superrtl
+ 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+ 'info_dict': {
+ 'id': '99205',
+ 'display_id': 'medicopter-117/angst',
+ 'ext': 'mp4',
+ 'title': 'Angst!',
+ 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1222632900,
+ 'upload_date': '20080928',
+ 'duration': 3025,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # ntv
+ 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+ 'info_dict': {
+ 'id': '203521',
+ 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+ 'ext': 'mp4',
+ 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+ 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432751700,
+ 'upload_date': '20150527',
+ 'duration': 1083,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # vox
+ 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+ 'info_dict': {
+ 'id': '128953',
+ 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+ 'ext': 'mp4',
+ 'title': "Büro-Fall / Chihuahua 'Joel'",
+ 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432408200,
+ 'upload_date': '20150523',
+ 'duration': 3092,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ station = mobj.group('station')
+
+ info = self._download_json(
+ 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+ display_id)
+
+ video_id = compat_str(info['id'])
+
+ files = info['files']
+ if not files:
+ if info.get('geoblocked', False):
+ raise ExtractorError(
+ 'Video %s is not available from your location due to geo restriction' % video_id,
+ expected=True)
+ if not info.get('free', True):
+ raise ExtractorError(
+ 'Video %s is not available for free' % video_id, expected=True)
+
+ f = info.get('format', {})
+ station = f.get('station') or station
+
+ STATIONS = {
+ 'rtl': 'rtlnow',
+ 'rtl2': 'rtl2now',
+ 'vox': 'voxnow',
+ 'nitro': 'rtlnitronow',
+ 'ntv': 'n-tvnow',
+ 'superrtl': 'superrtlnow'
+ }
+
+ formats = []
+ for item in files['items']:
+ item_path = remove_start(item['path'], '/')
+ tbr = int_or_none(item['bitrate'])
+ m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+ m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+ formats.append({
+ 'url': m3u8_url,
+ 'format_id': '%s-%sk' % (item['id'], tbr),
+ 'ext': 'mp4',
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ title = info['title']
+ description = info.get('articleLong') or info.get('articleShort')
+ timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+ duration = parse_duration(info.get('duration'))
+ thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index f179ea200..6cdc2638b 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
embed = self._html_search_regex(
- r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+ r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
webpage, 'embedded URL', default=None)
if attach_fn is not None:
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644
index 000000000..72d1b2718
--- /dev/null
+++ b/youtube_dl/extractor/porn91.py
@@ -0,0 +1,71 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+ ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+ IE_NAME = '91porn'
+ _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+ _TEST = {
+ 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+ 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'info_dict': {
+ 'id': '7e42283b4f5ab36da134',
+ 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+ 'ext': 'mp4',
+ 'duration': 431,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+ self._set_cookie('91porn.com', 'language', 'cn_CN')
+ webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+ if '作为游客,你每天只可观看10个视频' in webpage:
+ raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+ title = self._search_regex(
+ r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+ title = title.replace('\n', '')
+
+ # get real url
+ file_id = self._search_regex(
+ r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+ sec_code = self._search_regex(
+ r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+ max_vid = self._search_regex(
+ r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+ url_params = compat_urllib_parse.urlencode({
+ 'VID': file_id,
+ 'mp4': '1',
+ 'seccode': sec_code,
+ 'max_vid': max_vid,
+ })
+ info_cn = self._download_webpage(
+ 'http://91porn.com/getfile.php?' + url_params, video_id,
+ 'get real video url')
+ video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+ duration = parse_duration(self._search_regex(
+ r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+ comment_count = int_or_none(self._search_regex(
+ r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'comment_count': comment_count,
+ }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index daa284ea2..8565d7551 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -19,7 +19,7 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-f]+)'
_TEST = {
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '882f488fa1f0026f023f33576004a2ed',
@@ -32,6 +32,13 @@ class PornHubIE(InfoExtractor):
}
}
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
def _extract_count(self, pattern, webpage, name):
return str_to_int(self._search_regex(
pattern, webpage, '%s count' % name, fatal=False))
@@ -39,7 +46,8 @@ class PornHubIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = compat_urllib_request.Request(
+ 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
index 9688ed948..eba4dfbb3 100644
--- a/youtube_dl/extractor/pornovoisines.py
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
'duration': 120,
'view_count': int,
'average_rating': float,
- 'categories': ['Débutante', 'Scénario', 'Sodomie'],
+ 'categories': ['Débutantes', 'Scénario', 'Sodomie'],
'age_limit': 18,
}
}
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
view_count = int_or_none(self._search_regex(
r'(\d+) vues', webpage, 'view count', fatal=False))
average_rating = self._search_regex(
- r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+ r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
if average_rating:
average_rating = float_or_none(average_rating.replace(',', '.'))
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 255d4abc1..536a42dc8 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -177,6 +177,7 @@ class ProSiebenSat1IE(InfoExtractor):
r'<header class="clearfix">\s*<h3>(.+?)</h3>',
r'<!-- start video -->\s*<h1>(.+?)</h1>',
r'<h1 class="att-name">\s*(.+?)</h1>',
+ r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +207,8 @@ class ProSiebenSat1IE(InfoExtractor):
def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
- access_token = 'testclient'
- client_name = 'kolibri-1.2.5'
+ access_token = 'prosieben'
+ client_name = 'kolibri-1.12.6'
client_location = url
videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -275,13 +276,17 @@ class ProSiebenSat1IE(InfoExtractor):
for source in urls_sources:
protocol = source['protocol']
if protocol == 'rtmp' or protocol == 'rtmpe':
- mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url'])
if not mobj:
continue
+ path = mobj.group('path')
+ mp4colon_index = path.rfind('mp4:')
+ app = path[:mp4colon_index]
+ play_path = path[mp4colon_index:]
formats.append({
- 'url': mobj.group('url'),
- 'app': mobj.group('app'),
- 'play_path': mobj.group('playpath'),
+ 'url': '%s/%s' % (mobj.group('url'), app),
+ 'app': app,
+ 'play_path': play_path,
'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
'page_url': 'http://www.prosieben.de',
'vbr': fix_bitrate(source['bitrate']),
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index b540033e2..bafa81c21 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -9,7 +9,6 @@ from .common import InfoExtractor
from ..utils import (
strip_jsonp,
unescapeHTML,
- js_to_json,
)
from ..compat import compat_urllib_request
@@ -19,10 +18,10 @@ class QQMusicIE(InfoExtractor):
_VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
- 'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+ 'md5': '9ce1c1c8445f561506d2e3cfb0255705',
'info_dict': {
'id': '004295Et37taLD',
- 'ext': 'm4a',
+ 'ext': 'mp3',
'title': '可惜没如果',
'upload_date': '20141227',
'creator': '林俊杰',
@@ -30,6 +29,12 @@ class QQMusicIE(InfoExtractor):
}
}]
+ _FORMATS = {
+ 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+ 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+ 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+ }
+
# Reference: m_r_GetRUin() in top_player.js
# http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
@staticmethod
@@ -69,11 +74,22 @@ class QQMusicIE(InfoExtractor):
'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
mid, note='Retrieve vkey', errnote='Unable to get vkey',
transform_source=strip_jsonp)['key']
- song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+ formats = []
+ for format_id, details in self._FORMATS.items():
+ formats.append({
+ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+ % (details['prefix'], mid, details['ext'], vkey, guid),
+ 'format': format_id,
+ 'format_id': format_id,
+ 'preference': details['preference'],
+ 'abr': details.get('abr'),
+ })
+ self._sort_formats(formats)
return {
'id': mid,
- 'url': song_url,
+ 'formats': formats,
'title': song_name,
'upload_date': publish_time,
'creator': singer,
@@ -181,60 +197,49 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
_VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=toplist&p=global_12',
+ 'url': 'http://y.qq.com/#type=toplist&p=global_123',
'info_dict': {
- 'id': 'global_12',
- 'title': 'itunes榜',
+ 'id': 'global_123',
+ 'title': '美国iTunes榜',
},
'playlist_count': 10,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=top_6',
+ 'url': 'http://y.qq.com/#type=toplist&p=top_3',
'info_dict': {
- 'id': 'top_6',
+ 'id': 'top_3',
'title': 'QQ音乐巅峰榜·欧美',
+ 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+ '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+ '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+ '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
},
'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=global_5',
+ 'url': 'http://y.qq.com/#type=toplist&p=global_106',
'info_dict': {
- 'id': 'global_5',
- 'title': '韩国mnet排行榜',
+ 'id': 'global_106',
+ 'title': '韩国Mnet榜',
},
'playlist_count': 50,
}]
- @staticmethod
- def strip_qq_jsonp(code):
- return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code))
-
def _real_extract(self, url):
list_id = self._match_id(url)
list_type, num_id = list_id.split("_")
- list_page = self._download_webpage(
- "http://y.qq.com/y/static/toplist/index/%s.html" % list_id,
- list_id, 'Download toplist page')
-
- entries = []
- if list_type == 'top':
- jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id
- else:
- jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id
-
toplist_json = self._download_json(
- jsonp_url, list_id, note='Retrieve toplist json',
- errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp)
-
- for song in toplist_json['l']:
- s = song['s']
- song_mid = s.split("|")[20]
- entries.append(self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
- song_mid))
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+ % (list_type, num_id),
+ list_id, 'Download toplist page')
- list_name = self._html_search_regex(
- r'<h2 id="top_name">([^\']+)</h2>', list_page, 'top list name',
- default=None)
+ entries = [
+ self.url_result(
+ 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
+ ) for song in toplist_json['songlist']
+ ]
- return self.playlist_result(entries, list_id, list_name)
+ topinfo = toplist_json.get('topinfo', {})
+ list_name = topinfo.get('ListName')
+ list_description = topinfo.get('info')
+ return self.playlist_result(entries, list_id, list_name, list_description)
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index 5a381d9ce..e4215d546 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -21,6 +21,13 @@ class RTBFIE(InfoExtractor):
}
}
+ _QUALITIES = [
+ ('mobile', 'mobile'),
+ ('web', 'SD'),
+ ('url', 'MD'),
+ ('high', 'HD'),
+ ]
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -32,14 +39,21 @@ class RTBFIE(InfoExtractor):
r'data-video="([^"]+)"', webpage, 'data video')),
video_id)
- video_url = data.get('downloadUrl') or data.get('url')
-
if data.get('provider').lower() == 'youtube':
+ video_url = data.get('downloadUrl') or data.get('url')
return self.url_result(video_url, 'Youtube')
+ formats = []
+ for key, format_id in self._QUALITIES:
+ format_url = data['sources'].get(key)
+ if format_url:
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ })
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': data['title'],
'description': data.get('description') or data.get('subtitle'),
'thumbnail': data.get('thumbnail'),
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index cfce4550a..41d202c28 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor):
IE_NAME = 'rtl.nl'
IE_DESC = 'rtl.nl and rtlxl.nl'
_VALID_URL = r'''(?x)
- https?://(www\.)?
+ https?://(?:www\.)?
(?:
rtlxl\.nl/\#!/[^/]+/|
- rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+ rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
)
(?P<id>[0-9a-f-]+)'''
@@ -43,6 +43,9 @@ class RtlNlIE(InfoExtractor):
'upload_date': '20150215',
'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
}
+ }, {
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644
index 785a8045e..000000000
--- a/youtube_dl/extractor/rtlnow.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- clean_html,
- unified_strdate,
- int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
- """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'''(?x)
- (?:https?://)?
- (?P<url>
- (?P<domain>
- rtl-now\.rtl\.de|
- rtl2now\.rtl2\.de|
- (?:www\.)?voxnow\.de|
- (?:www\.)?rtlnitronow\.de|
- (?:www\.)?superrtlnow\.de|
- (?:www\.)?n-tvnow\.de)
- /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
- (?:container_id|film_id)=(?P<video_id>[0-9]+)&
- player=1(?:&season=[0-9]+)?(?:&.*)?
- )'''
-
- _TESTS = [
- {
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'info_dict': {
- 'id': '90419',
- 'ext': 'flv',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
- 'upload_date': '20070416',
- 'duration': 1685,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'info_dict': {
- 'id': '69756',
- 'ext': 'flv',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- 'upload_date': '20120519',
- 'duration': 1245,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'info_dict': {
- 'id': '13883',
- 'ext': 'flv',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
- 'upload_date': '20090627',
- 'duration': 1800,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'info_dict': {
- 'id': '99205',
- 'ext': 'flv',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
- 'upload_date': '20080928',
- 'duration': 2691,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
- 'info_dict': {
- 'id': '188729',
- 'ext': 'flv',
- 'upload_date': '20150204',
- 'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
- 'title': 'Der Bachelor - Folge 4',
- }
- }, {
- 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
- 'only_matching': True,
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_page_url = 'http://%s/' % mobj.group('domain')
- video_id = mobj.group('video_id')
-
- webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
- mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
- if mobj:
- raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage, default=None)
-
- upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
- mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
- duration = int(mobj.group('seconds')) if mobj else None
-
- playerdata_url = self._html_search_regex(
- r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
- playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
- videoinfo = playerdata.find('./playlist/videoinfo')
-
- formats = []
- for filename in videoinfo.findall('filename'):
- mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
- if mobj:
- fmt = {
- 'url': mobj.group('url'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': video_page_url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
- if mobj:
- fmt = {
- 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- fmt = {
- 'url': filename.text,
- }
- fmt.update({
- 'width': int_or_none(filename.get('width')),
- 'height': int_or_none(filename.get('height')),
- 'vbr': int_or_none(filename.get('bitrate')),
- 'ext': 'flv',
- })
- formats.append(fmt)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644
index 000000000..4e22628d0
--- /dev/null
+++ b/youtube_dl/extractor/ruutu.py
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+ 'md5': 'ab2093f39be1ca8581963451b3c0234f',
+ 'info_dict': {
+ 'id': '2058907',
+ 'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+ 'ext': 'mp4',
+ 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+ 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 114,
+ 'age_limit': 0,
+ },
+ },
+ {
+ 'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa',
+ 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+ 'info_dict': {
+ 'id': '2057306',
+ 'display_id': 'superpesis-katso-koko-kausi-ruudussa',
+ 'ext': 'mp4',
+ 'title': 'Superpesis: katso koko kausi Ruudussa',
+ 'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 40,
+ 'age_limit': 0,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'data-media-id="(\d+)"', webpage, 'media id')
+
+ video_xml_url = None
+
+ media_data = self._search_regex(
+ r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage,
+ 'media data', default=None)
+ if media_data:
+ media_json = self._parse_json(media_data, display_id, fatal=False)
+ if media_json:
+ xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl')
+ if xml_url:
+ video_xml_url = xml_url.replace('{ID}', video_id)
+
+ if not video_xml_url:
+ video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id
+
+ video_xml = self._download_xml(video_xml_url, video_id)
+
+ formats = []
+ processed_urls = []
+
+ def extract_formats(node):
+ for child in node:
+ if child.tag.endswith('Files'):
+ extract_formats(child)
+ elif child.tag.endswith('File'):
+ video_url = child.text
+ if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+ return
+ processed_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls'))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ else:
+ proto = compat_urllib_parse_urlparse(video_url).scheme
+ if not child.tag.startswith('HTTP') and proto != 'rtmp':
+ continue
+ preference = -1 if proto == 'rtmp' else 1
+ label = child.get('label')
+ tbr = int_or_none(child.get('bitrate'))
+ width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')]
+ formats.append({
+ 'format_id': '%s-%s' % (proto, label if label else tbr),
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'preference': preference,
+ })
+
+ extract_formats(video_xml.find('./Clip'))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+ 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index d3b8a1be4..9c53704ea 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
["arch", "", "http://ussenate-f.akamaihd.net/"]
]
_IE_NAME = 'senate.gov'
- _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+ _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
'ext': 'mp4',
'title': 'Integrated Senate Video Player'
}
+ }, {
+ # From http://www.c-span.org/video/?96791-1
+ 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+ 'only_matching': True,
}]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
- r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+ r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644
index 000000000..5da66ca9e
--- /dev/null
+++ b/youtube_dl/extractor/soompi.py
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ remove_start,
+ xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+ def _get_episodes(self, webpage, episode_filter=None):
+ episodes = self._parse_json(
+ self._search_regex(
+ r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+ None)
+ return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+ IE_NAME = 'soompi'
+ _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/watch/29235',
+ 'info_dict': {
+ 'id': '29235',
+ 'ext': 'mp4',
+ 'title': 'Episode 1096',
+ 'description': '2015-05-20'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _get_episode(self, webpage, video_id):
+ return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+ def _get_subtitles(self, config, video_id):
+ sub_langs = {}
+ for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+ sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+ subtitles = {}
+ for s in config.findall('./{default}preload/subtitle'):
+ lang_code = sub_langs.get(s.attrib['id'])
+ if not lang_code:
+ continue
+ sub_id = s.get('id')
+ data = xpath_text(s, './data', 'data')
+ iv = xpath_text(s, './iv', 'iv')
+ if not id or not iv or not data:
+ continue
+ subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
+ return subtitles
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ try:
+ webpage = self._download_webpage(
+ url, video_id, 'Downloading episode page')
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+ webpage = ee.cause.read()
+ block_message = self._html_search_regex(
+ r'(?s)<div class="block-message">(.+?)</div>', webpage,
+ 'block message', default=None)
+ if block_message:
+ raise ExtractorError(block_message, expected=True)
+ raise
+
+ formats = []
+ config = None
+ for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+ config = self._download_xml(
+ 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+ video_id, 'Downloading %s XML' % format_id)
+ m3u8_url = xpath_text(
+ config, './{default}preload/stream_info/file',
+ '%s m3u8 URL' % format_id)
+ if not m3u8_url:
+ continue
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+ self._sort_formats(formats)
+
+ episode = self._get_episode(webpage, video_id)
+
+ title = episode['name']
+ description = episode.get('description')
+ duration = int_or_none(episode.get('duration'))
+
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+ subtitles = self.extract_subtitles(config, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles
+ }
+
+
+class SoompiShowIE(SoompiBaseIE):
+ IE_NAME = 'soompi:show'
+ _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/shows/liar-game',
+ 'info_dict': {
+ 'id': 'liar-game',
+ 'title': 'Liar Game',
+ 'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+ },
+ 'playlist_count': 14,
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ url, show_id, 'Downloading show page')
+
+ title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+ description = self._og_search_description(webpage)
+
+ entries = [
+ self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+ for episode in self._get_episodes(webpage)]
+
+ return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89..08a5c4314 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ float_or_none,
+)
class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg$',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
}
}, {
@@ -51,9 +55,37 @@ class SpiegeltvIE(InfoExtractor):
is_wide = media_json['is_wide']
server_json = self._download_json(
- 'http://www.spiegel.tv/streaming_servers/', video_id,
- note='Downloading server information')
- server = server_json[0]['endpoint']
+ 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+ video_id, note='Downloading server information')
+
+ format = '16x9' if is_wide else '4x3'
+
+ formats = []
+ for streamingserver in server_json['streamingserver']:
+ endpoint = streamingserver.get('endpoint')
+ if not endpoint:
+ continue
+ play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+ if endpoint.startswith('rtmp'):
+ formats.append({
+ 'url': endpoint,
+ 'format_id': 'rtmp',
+ 'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+ 'play_path': play_path,
+ 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+ 'ext': 'flv',
+ 'rtmp_live': True,
+ })
+ elif determine_ext(endpoint) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ endpoint.replace('[video]', play_path),
+ video_id, 'm4v',
+ preference=1, # Prefer hls since it allows to workaround georestriction
+ m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': endpoint,
+ })
thumbnails = []
for image in media_json['images']:
@@ -65,16 +97,12 @@ class SpiegeltvIE(InfoExtractor):
description = media_json['subtitle']
duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
- format = '16x9' if is_wide else '4x3'
-
- url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
return {
'id': video_id,
'title': title,
- 'url': url,
- 'ext': 'm4v',
'description': description,
'duration': duration,
- 'thumbnails': thumbnails
+ 'thumbnails': thumbnails,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 854d01bee..e527aa971 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'class="views">\s*(\d+)\s*<',
+ r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
r'(\d+)</b> Comments?',
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index b2a4b1fc0..d1b7264b4 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -51,6 +51,17 @@ class TeamcocoIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 downloads
}
+ }, {
+ 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+ 'info_dict': {
+ 'id': '89341',
+ 'ext': 'mp4',
+ 'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
}
]
_VIDEO_ID_REGEXES = (
@@ -110,9 +121,23 @@ class TeamcocoIE(InfoExtractor):
get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
for filed in data['files']:
if determine_ext(filed['url']) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- filed['url'], video_id, ext='mp4'))
+ # compat_urllib_parse.urljoin does not work here
+ if filed['url'].startswith('/'):
+ m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+ else:
+ m3u8_url = filed['url']
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4')
+ for m3u8_format in m3u8_formats:
+ if m3u8_format not in formats:
+ formats.append(m3u8_format)
+ elif determine_ext(filed['url']) == 'f4m':
+ # TODO Correct f4m extraction
+ continue
else:
+ if filed['url'].startswith('/mp4:protected/'):
+ # TODO Correct extraction for these files
+ continue
m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
if m_format is not None:
format_id = m_format.group(1)
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 656410528..3a68eaa80 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
_TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
@@ -35,6 +35,9 @@ class TF1IE(InfoExtractor):
}, {
'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
'only_matching': True,
+ }, {
+ 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 92731ad3d..83d833e30 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -26,7 +26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
- (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+ (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
_TESTS = [{
@@ -56,6 +56,17 @@ class ThePlatformIE(InfoExtractor):
# rtmp download
'skip_download': True,
}
+ }, {
+ 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+ 'info_dict': {
+ 'id': 'yMBg9E8KFxZD',
+ 'ext': 'mp4',
+ 'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+ 'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+ }
+ }, {
+ 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+ 'only_matching': True,
}]
@staticmethod
@@ -85,6 +96,11 @@ class ThePlatformIE(InfoExtractor):
if not provider_id:
provider_id = 'dJ5BDC'
+ path = provider_id
+ if mobj.group('media'):
+ path += '/media'
+ path += '/' + video_id
+
if smuggled_data.get('force_smil_url', False):
smil_url = url
elif mobj.group('config'):
@@ -94,8 +110,7 @@ class ThePlatformIE(InfoExtractor):
config = self._download_json(config_url, video_id, 'Downloading config')
smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
- smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?'
- 'format=smil&mbr=true'.format(provider_id, video_id))
+ smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
sig = smuggled_data.get('sig')
if sig:
@@ -112,7 +127,7 @@ class ThePlatformIE(InfoExtractor):
else:
raise ExtractorError(error_msg, expected=True)
- info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id)
+ info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index 9f9e388c5..13263614c 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
IE_NAME = 'tlc.com'
_VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
- _TEST = {
+ # DiscoveryIE has _TESTS
+ _TESTS = [{
'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
- 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
'info_dict': {
- 'id': '853232',
+ 'id': '104493',
'ext': 'mp4',
- 'title': 'Cake Boss: Too Big to Fly',
+ 'title': 'Too Big to Fly',
'description': 'Buddy has taken on a high flying task.',
'duration': 119,
+ 'timestamp': 1393365060,
+ 'upload_date': '20140225',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ }]
class TlcDeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 59af9aba0..c282865b2 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -33,7 +33,7 @@ class TNAFlixIE(InfoExtractor):
},
{
'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
- 'matching_only': True,
+ 'only_matching': True,
}
]
@@ -51,9 +51,8 @@ class TNAFlixIE(InfoExtractor):
age_limit = self._rta_search(webpage)
- duration = self._html_search_meta('duration', webpage, 'duration', default=None)
- if duration:
- duration = parse_duration(duration[1:])
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
cfg_url = self._proto_relative_url(self._html_search_regex(
self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
@@ -62,14 +61,15 @@ class TNAFlixIE(InfoExtractor):
cfg_url, display_id, note='Downloading metadata',
transform_source=fix_xml_ampersands)
- thumbnail = cfg_xml.find('./startThumb').text
+ thumbnail = self._proto_relative_url(
+ cfg_xml.find('./startThumb').text, 'http:')
formats = []
for item in cfg_xml.findall('./quality/item'):
video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
format_id = item.find('res').text
fmt = {
- 'url': video_url,
+ 'url': self._proto_relative_url(video_url, 'http:'),
'format_id': format_id,
}
m = re.search(r'^(\d+)', format_id)
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762..c9cb69333 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
webpage = self._download_webpage(req, display_id)
flashvars = json.loads(self._html_search_regex(
- r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+ r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
video_url = flashvars['video_url']
if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
thumbnail = flashvars.get('image_url')
title = self._html_search_regex(
- r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+ r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
description = self._html_search_regex(
- r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+ r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
uploader = self._html_search_regex(
- r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+ r'<span class="username">\s*(.+?)\s*<',
webpage, 'uploader', fatal=False)
like_count = int_or_none(self._html_search_regex(
- r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+ r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
dislike_count = int_or_none(self._html_search_regex(
- r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+ r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
view_count = self._html_search_regex(
- r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+ r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 000000000..2c4b21807
--- /dev/null
+++ b/youtube_dl/extractor/tubitv.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+ _LOGIN_URL = 'http://tubitv.com/login'
+ _NETRC_MACHINE = 'tubitv'
+ _TEST = {
+ 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+ 'info_dict': {
+ 'id': '54411',
+ 'ext': 'mp4',
+ 'title': 'The Kitchen Musical - EP01',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'description': 'md5:37532716166069b353e8866e71fefae7',
+ 'duration': 2407,
+ },
+ 'params': {
+ 'skip_download': 'HLS download',
+ },
+ }
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ form_data = {
+ 'username': username,
+ 'password': password,
+ }
+ payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+ request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ login_page = self._download_webpage(
+ request, None, False, 'Wrong login info')
+ if not re.search(r'id="tubi-logout"', login_page):
+ raise ExtractorError(
+ 'Login failed (invalid username/password)', expected=True)
+
+ def _real_initialize(self):
+ """Initialization hook: delegates to _login()."""
+ self._login()
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+ raise ExtractorError(
+ 'This video requires login, use --username and --password '
+ 'options to provide account credentials.', expected=True)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+ m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 828c808a6..63c20310d 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .pornhub import PornHubIE
class TumblrIE(InfoExtractor):
@@ -28,6 +29,17 @@ class TumblrIE(InfoExtractor):
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
+ }, {
+ 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+ 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+ 'info_dict': {
+ 'id': 'Wmur',
+ 'ext': 'mp4',
+ 'title': 'naked smoking & stretching',
+ 'upload_date': '20150506',
+ 'timestamp': 1430931613,
+ },
+ 'add_ie': ['Vidme'],
}]
def _real_extract(self, url):
@@ -38,6 +50,16 @@ class TumblrIE(InfoExtractor):
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
+ vid_me_embed_url = self._search_regex(
+ r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+ webpage, 'vid.me embed', default=None)
+ if vid_me_embed_url is not None:
+ return self.url_result(vid_me_embed_url, 'Vidme')
+
+ pornhub_url = PornHubIE._extract_url(webpage)
+ if pornhub_url:
+ return self.url_result(pornhub_url, 'PornHub')
+
iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
webpage, 'iframe url')
diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py
index 29703a8a9..7ae63a499 100644
--- a/youtube_dl/extractor/turbo.py
+++ b/youtube_dl/extractor/turbo.py
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
'ext': 'mp4',
'duration': 3715,
'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
- 'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+ 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
title = xpath_text(item, './title', 'title')
duration = int_or_none(xpath_text(item, './durate', 'duration'))
thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
- description = self._og_search_description(webpage)
+ description = self._html_search_meta('description', webpage)
formats = []
get_quality = qualities(['3g', 'sd', 'hq'])
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644
index 000000000..3a4f393fc
--- /dev/null
+++ b/youtube_dl/extractor/tvc.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+ 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+ formats = []
+ for info in video.get('path', {}).get('quality', []):
+ video_url = info.get('url')
+ if not video_url:
+ continue
+ format_id = self._search_regex(
+ r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+ 'format id', default=None)
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'width': int_or_none(info.get('width')),
+ 'height': int_or_none(info.get('height')),
+ 'tbr': int_or_none(info.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video['title'],
+ 'thumbnail': video.get('picture'),
+ 'duration': int_or_none(video.get('duration')),
+ 'formats': formats,
+ }
+
+
+class TVCArticleIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/news/show/id/69944',
+ 'info_dict': {
+ 'id': '75399',
+ 'ext': 'mp4',
+ 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+ 'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 278,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+ 'info_dict': {
+ 'id': '2185',
+ 'ext': 'mp4',
+ 'title': 'Ещё не поздно. Эфир от 03.08.2013',
+ 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 3316,
+ },
+ }]
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, self._match_id(url))
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'TVC',
+ 'url': self._og_search_video_url(webpage),
+ 'title': clean_html(self._og_search_title(webpage)),
+ 'description': clean_html(self._og_search_description(webpage)),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 102362b29..dc3a8334a 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -5,7 +5,9 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
float_or_none,
+ int_or_none,
parse_age_limit,
)
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
'display_id': 'sokrat',
'ext': 'flv',
'title': 'Сократ',
- 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
'duration': 6586,
- 'age_limit': 0,
+ 'age_limit': 12,
},
+ 'skip': 'georestricted',
},
{
'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
- 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+ 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '5142516',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'duration': 186.080,
'age_limit': 0,
},
+ 'skip': 'georestricted',
}, {
'url': 'https://cloud.tvigle.ru/video/5267604/',
'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
- r'<li class="video-preview current_playing" id="(\d+)">',
+ r'class="video-preview current_playing" id="(\d+)">',
webpage, 'video id')
video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
item = video_data['playlist']['items'][0]
+ videos = item.get('videos')
+
+ error_message = item.get('errorMessage')
+ if not videos and error_message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
title = item['title']
- description = item['description']
- thumbnail = item['thumbnail']
+ description = item.get('description')
+ thumbnail = item.get('thumbnail')
duration = float_or_none(item.get('durationMilliseconds'), 1000)
age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
- for quality, video_url in fmts.items():
+ for format_id, video_url in fmts.items():
+ if format_id == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=vcodec))
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
formats.append({
'url': video_url,
- 'format_id': '%s-%s' % (vcodec, quality),
+ 'format_id': '%s-%s' % (vcodec, format_id),
'vcodec': vcodec,
- 'height': int(quality[:-1]),
- 'filesize': item['video_files_size'][vcodec][quality],
+ 'height': int_or_none(height),
+ 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index e83e31a31..79863e781 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
viasat4play\.no/programmer|
tv6play\.no/programmer|
tv3play\.dk/programmer|
+ play\.novatv\.bg/programi
)/[^/]+/(?P<id>\d+)
'''
_TESTS = [
@@ -173,6 +174,22 @@ class TVPlayIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+ 'info_dict': {
+ 'id': '624952',
+ 'ext': 'flv',
+ 'title': 'Здравей, България (12.06.2015 г.) ',
+ 'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+ 'duration': 8838,
+ 'timestamp': 1434100372,
+ 'upload_date': '20150612',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index 67e8bfea0..c1ee1decc 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.24video.net/video/view/1044982',
- 'md5': '48dd7646775690a80447a8dca6a2df76',
+ 'md5': 'd041af8b5b4246ea466226a0d6693345',
'info_dict': {
'id': '1044982',
'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
webpage, 'upload date'))
uploader = self._html_search_regex(
- r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+ r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
webpage, 'uploader', fatal=False)
view_count = int_or_none(self._html_search_regex(
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index dd026748d..722eb5236 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- redirect_page, urlh = self._download_webpage_handle(url, video_id)
- new_location = self._search_regex(r'window\.location = \'(.*)\';',
- redirect_page, 'redirect location')
- redirect_url = urlh.geturl() + new_location
- webpage = self._download_webpage(redirect_url, video_id,
+ # need to get the page 3 times for the correct jsSecretToken cookie
+ # which is necessary for the correct title
+ def get_session_id():
+ redirect_page = self._download_webpage(url, video_id)
+ session_id_url = self._search_regex(
+ r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+ 'session id url')
+ self._download_webpage(
+ compat_urlparse.urljoin(url, session_id_url), video_id,
+ 'Getting session id')
+
+ get_session_id()
+ get_session_id()
+
+ webpage = self._download_webpage(url, video_id,
'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index e6ee1e471..f38a72fde 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class VGTVIE(InfoExtractor):
@@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):
},
{
# streamType: live
- 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+ 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
'info_dict': {
- 'id': '100015',
+ 'id': '113063',
'ext': 'flv',
- 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
- 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+ 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:b3743425765355855f88e096acc93231',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 0,
- 'timestamp': 1407423348,
- 'upload_date': '20140807',
+ 'timestamp': 1432975582,
+ 'upload_date': '20150530',
'view_count': int,
},
'params': {
@@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):
% (host, video_id, HOST_WEBSITES[host]),
video_id, 'Downloading media JSON')
+ if data.get('status') == 'inactive':
+ raise ExtractorError(
+ 'Video %s is no longer available' % video_id, expected=True)
+
streams = data['streamUrls']
+ stream_type = data.get('streamType')
formats = []
@@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):
hls_url, video_id, 'mp4', m3u8_id='hls'))
hds_url = streams.get('hds')
- if hds_url:
+ # wasLive hds are always 404
+ if hds_url and stream_type != 'wasLive':
formats.extend(self._extract_f4m_formats(
hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
video_id, f4m_id='hds'))
@@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': data['title'],
+ 'title': self._live_title(data['title']),
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
'formats': formats,
+ 'is_live': True if stream_type == 'live' else False,
}
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index bd953fb4c..e0b55078b 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -10,7 +10,7 @@ from ..utils import (
class VidmeIE(InfoExtractor):
_VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vid.me/QNB',
'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
'info_dict': {
@@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):
'upload_date': '20140725',
'thumbnail': 're:^https?://.*\.jpg',
},
- }
+ }, {
+ # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+ 'url': 'https://vid.me/e/Wmur',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
+ url = url.replace('vid.me/e/', 'vid.me/')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index cc384adbf..d0e772108 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -119,8 +119,8 @@ class VKIE(InfoExtractor):
'act': 'login',
'role': 'al_frame',
'expire': '1',
- 'email': username,
- 'pass': password,
+ 'email': username.encode('cp1251'),
+ 'pass': password.encode('cp1251'),
}
request = compat_urllib_request.Request('https://login.vk.com/?act=login',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0301682b8..3448bec4f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -785,7 +785,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
dash_manifest_url, video_id,
note='Downloading DASH manifest',
@@ -1290,7 +1290,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _extract_playlist(self, playlist_id):
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
- more_widget_html = content_html = page
for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
match = match.strip()
@@ -1310,36 +1309,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
self.report_warning('Youtube gives an alert message: ' + match)
# Extract the video ids from the playlist pages
- ids = []
-
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- ids.extend(new_ids)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+ def _entries():
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.finditer(self._VIDEO_RE, content_html)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+ for vid_id in new_ids:
+ yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, 'title')
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_id, playlist_title)
+ return self.playlist_result(_entries(), playlist_id, playlist_title)
def _real_extract(self, url):
# Extract playlist id
@@ -1399,6 +1398,24 @@ class YoutubeChannelIE(InfoExtractor):
channel_id = self._match_id(url)
url = self._TEMPLATE_URL % channel_id
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+ url + '?view=57', channel_id,
+ 'Downloading channel page', fatal=False)
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_playlist_id = self._search_regex(
+ r'data-channel-external-id="([^"]+)"',
+ channel_page, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+ playlist_id = 'UU' + channel_playlist_id[2:]
+ return self.url_result(
+ compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
autogenerated = re.search(r'''(?x)
class="[^"]*?(?:
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 5a2315bd9..740458e51 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -145,7 +145,7 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--list-extractors',
action='store_true', dest='list_extractors', default=False,
- help='List all supported extractors and the URLs they would handle')
+ help='List all supported extractors')
general.add_option(
'--extractor-descriptions',
action='store_true', dest='list_extractor_descriptions', default=False,
@@ -725,7 +725,7 @@ def parseOpts(overrideArguments=None):
metavar='POLICY', dest='fixup', default='detect_or_warn',
help='Automatically correct known faults of the file. '
'One of never (do nothing), warn (only emit a warning), '
- 'detect_or_warn(the default; fix file if we can, warn otherwise)')
+ 'detect_or_warn (the default; fix file if we can, warn otherwise)')
postproc.add_option(
'--prefer-avconv',
action='store_false', dest='prefer_ffmpeg',
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 8f825f785..774494efd 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
- elif info['ext'] == 'm4a':
+ elif info['ext'] in ['m4a', 'mp4']:
if not check_executable('AtomicParsley', ['-v']):
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
@@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
else:
- raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+ raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
return [], info
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index de3169eef..fc7ac8305 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -50,7 +50,7 @@ def rsa_verify(message, signature, key):
def update_self(to_screen, verbose):
"""Update the program file with the latest version from the repository"""
- UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
+ UPDATE_URL = "https://rg3.github.io/youtube-dl/update/"
VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
JSON_URL = UPDATE_URL + 'versions.json'
UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b33385153..34a13cb81 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.05.20'
+__version__ = '2015.06.15'