about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 35
-rw-r--r-- youtube_dl/extractor/adultswim.py 3
-rw-r--r-- youtube_dl/extractor/aenetworks.py 66
-rw-r--r-- youtube_dl/extractor/amp.py 1
-rw-r--r-- youtube_dl/extractor/atresplayer.py 5
-rw-r--r-- youtube_dl/extractor/audimedia.py 8
-rw-r--r-- youtube_dl/extractor/baidu.py 55
-rw-r--r-- youtube_dl/extractor/bbc.py 54
-rw-r--r-- youtube_dl/extractor/beeg.py 4
-rw-r--r-- youtube_dl/extractor/bigflix.py 85
-rw-r--r-- youtube_dl/extractor/canalc2.py 46
-rw-r--r-- youtube_dl/extractor/canalplus.py 59
-rw-r--r-- youtube_dl/extractor/canvas.py 65
-rw-r--r-- youtube_dl/extractor/cbsnews.py 4
-rw-r--r-- youtube_dl/extractor/ccc.py 35
-rw-r--r-- youtube_dl/extractor/common.py 66
-rw-r--r-- youtube_dl/extractor/crunchyroll.py 6
-rw-r--r-- youtube_dl/extractor/cultureunplugged.py 63
-rw-r--r-- youtube_dl/extractor/cwtv.py 88
-rw-r--r-- youtube_dl/extractor/dailymotion.py 13
-rw-r--r-- youtube_dl/extractor/dcn.py 8
-rw-r--r-- youtube_dl/extractor/discovery.py 39
-rw-r--r-- youtube_dl/extractor/dramafever.py 50
-rw-r--r-- youtube_dl/extractor/dreisat.py 59
-rw-r--r-- youtube_dl/extractor/einthusan.py 33
-rw-r--r-- youtube_dl/extractor/espn.py 37
-rw-r--r-- youtube_dl/extractor/facebook.py 4
-rw-r--r-- youtube_dl/extractor/fox.py 39
-rw-r--r-- youtube_dl/extractor/generic.py 18
-rw-r--r-- youtube_dl/extractor/history.py 31
-rw-r--r-- youtube_dl/extractor/ign.py 13
-rw-r--r-- youtube_dl/extractor/iprima.py 1
-rw-r--r-- youtube_dl/extractor/ivi.py 97
-rw-r--r-- youtube_dl/extractor/ivideon.py 83
-rw-r--r-- youtube_dl/extractor/jukebox.py 59
-rw-r--r-- youtube_dl/extractor/mdr.py 2
-rw-r--r-- youtube_dl/extractor/mtv.py 21
-rw-r--r-- youtube_dl/extractor/nbc.py 2
-rw-r--r-- youtube_dl/extractor/nextmovie.py 30
-rw-r--r-- youtube_dl/extractor/nhl.py 2
-rw-r--r-- youtube_dl/extractor/nick.py 63
-rw-r--r-- youtube_dl/extractor/nowtv.py 1
-rw-r--r-- youtube_dl/extractor/npr.py 82
-rw-r--r-- youtube_dl/extractor/ntvde.py 33
-rw-r--r-- youtube_dl/extractor/ooyala.py 31
-rw-r--r-- youtube_dl/extractor/ora.py 75
-rw-r--r-- youtube_dl/extractor/orf.py 16
-rw-r--r-- youtube_dl/extractor/pandoratv.py 78
-rw-r--r-- youtube_dl/extractor/pluralsight.py 2
-rw-r--r-- youtube_dl/extractor/prosiebensat1.py 30
-rw-r--r-- youtube_dl/extractor/qqmusic.py 35
-rw-r--r-- youtube_dl/extractor/rai.py 7
-rw-r--r-- youtube_dl/extractor/regiotv.py 62
-rw-r--r-- youtube_dl/extractor/revision3.py 127
-rw-r--r-- youtube_dl/extractor/rte.py 84
-rw-r--r-- youtube_dl/extractor/rts.py 2
-rw-r--r-- youtube_dl/extractor/ruutu.py 5
-rw-r--r-- youtube_dl/extractor/shahid.py 3
-rw-r--r-- youtube_dl/extractor/soundcloud.py 38
-rw-r--r-- youtube_dl/extractor/srgssr.py 34
-rw-r--r-- youtube_dl/extractor/tele13.py 21
-rw-r--r-- youtube_dl/extractor/testtube.py 90
-rw-r--r-- youtube_dl/extractor/testurl.py 2
-rw-r--r-- youtube_dl/extractor/theplatform.py 2
-rw-r--r-- youtube_dl/extractor/tlc.py 23
-rw-r--r-- youtube_dl/extractor/tudou.py 94
-rw-r--r-- youtube_dl/extractor/tv4.py 2
-rw-r--r-- youtube_dl/extractor/tvland.py 64
-rw-r--r-- youtube_dl/extractor/twentymin.py 73
-rw-r--r-- youtube_dl/extractor/udemy.py 52
-rw-r--r-- youtube_dl/extractor/ultimedia.py 116
-rw-r--r-- youtube_dl/extractor/unistra.py 2
-rw-r--r-- youtube_dl/extractor/ustream.py 2
-rw-r--r-- youtube_dl/extractor/videomega.py 1
-rw-r--r-- youtube_dl/extractor/videomore.py 243
-rw-r--r-- youtube_dl/extractor/videott.py 1
-rw-r--r-- youtube_dl/extractor/vimeo.py 48
-rw-r--r-- youtube_dl/extractor/vodlocker.py 26
-rw-r--r-- youtube_dl/extractor/vrt.py 10
-rw-r--r-- youtube_dl/extractor/wdr.py 35
-rw-r--r-- youtube_dl/extractor/xhamster.py 35
-rw-r--r-- youtube_dl/extractor/xtube.py 72
-rw-r--r-- youtube_dl/extractor/yahoo.py 34
-rw-r--r-- youtube_dl/extractor/youku.py 10
-rw-r--r-- youtube_dl/extractor/youtube.py 19
-rw-r--r-- youtube_dl/extractor/zdf.py 15
-rw-r--r-- youtube_dl/extractor/zippcast.py 94
87 files changed, 2590 insertions, 793 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 9dcd252f8..aeb67354e 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -15,6 +15,7 @@ from .adobetv import (
AdobeTVVideoIE,
)
from .adultswim import AdultSwimIE
+from .aenetworks import AENetworksIE
from .aftonbladet import AftonbladetIE
from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
@@ -61,6 +62,7 @@ from .beeg import BeegIE
from .behindkink import BehindKinkIE
from .beatportpro import BeatportProIE
from .bet import BetIE
+from .bigflix import BigflixIE
from .bild import BildIE
from .bilibili import BiliBiliIE
from .bleacherreport import (
@@ -85,6 +87,7 @@ from .camdemy import (
)
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
+from .canvas import CanvasIE
from .cbs import CBSIE
from .cbsnews import CBSNewsIE
from .cbssports import CBSSportsIE
@@ -127,6 +130,8 @@ from .crunchyroll import (
)
from .cspan import CSpanIE
from .ctsnews import CtsNewsIE
+from .cultureunplugged import CultureUnpluggedIE
+from .cwtv import CWTVIE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
@@ -203,6 +208,7 @@ from .flickr import FlickrIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .fourtube import FourTubeIE
+from .fox import FOXIE
from .foxgay import FoxgayIE
from .foxnews import FoxNewsIE
from .foxsports import FoxSportsIE
@@ -260,7 +266,6 @@ from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .historicfilms import HistoricFilmsIE
-from .history import HistoryIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
from .hotnewhiphop import HotNewHipHopIE
@@ -298,11 +303,11 @@ from .ivi import (
IviIE,
IviCompilationIE
)
+from .ivideon import IvideonIE
from .izlesene import IzleseneIE
from .jadorecettepub import JadoreCettePubIE
from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE
-from .jukebox import JukeboxIE
from .jwplatform import JWPlatformIE
from .jpopsukitv import JpopsukiIE
from .kaltura import KalturaIE
@@ -435,6 +440,7 @@ from .nextmedia import (
NextMediaActionNewsIE,
AppleDailyIE,
)
+from .nextmovie import NextMovieIE
from .nfb import NFBIE
from .nfl import NFLIE
from .nhl import (
@@ -442,6 +448,7 @@ from .nhl import (
NHLNewsIE,
NHLVideocenterIE,
)
+from .nick import NickIE
from .niconico import NiconicoIE, NiconicoPlaylistIE
from .ninegag import NineGagIE
from .noco import NocoIE
@@ -472,6 +479,7 @@ from .npo import (
VPROIE,
WNLIE
)
+from .npr import NprIE
from .nrk import (
NRKIE,
NRKPlaylistIE,
@@ -491,12 +499,14 @@ from .ooyala import (
OoyalaIE,
OoyalaExternalIE,
)
+from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
ORFOE1IE,
ORFFM4IE,
ORFIPTVIE,
)
+from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
@@ -551,14 +561,16 @@ from .rai import (
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redtube import RedTubeIE
+from .regiotv import RegioTVIE
from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
+from .revision3 import Revision3IE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
-from .rte import RteIE
+from .rte import RteIE, RteRadioIE
from .rtlnl import RtlNlIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
@@ -682,7 +694,6 @@ from .telemb import TeleMBIE
from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
-from .testtube import TestTubeIE
from .tf1 import TF1IE
from .theintercept import TheInterceptIE
from .theonion import TheOnionIE
@@ -694,7 +705,7 @@ from .thesixtyone import TheSixtyOneIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
-from .tlc import TlcIE, TlcDeIE
+from .tlc import TlcDeIE
from .tmz import (
TMZIE,
TMZArticleIE,
@@ -716,7 +727,11 @@ from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tubitv import TubiTvIE
-from .tudou import TudouIE
+from .tudou import (
+ TudouIE,
+ TudouPlaylistIE,
+ TudouAlbumIE,
+)
from .tumblr import TumblrIE
from .tunein import (
TuneInClipIE,
@@ -737,10 +752,12 @@ from .tvc import (
TVCArticleIE,
)
from .tvigle import TvigleIE
+from .tvland import TVLandIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE
+from .twentymin import TwentyMinutenIE
from .twentytwotracks import (
TwentyTwoTracksIE,
TwentyTwoTracksGenreIE
@@ -783,6 +800,11 @@ from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videomega import VideoMegaIE
+from .videomore import (
+ VideomoreIE,
+ VideomoreVideoIE,
+ VideomoreSeasonIE,
+)
from .videopremium import VideoPremiumIE
from .videott import VideoTtIE
from .vidme import VidmeIE
@@ -896,6 +918,7 @@ from .zingmp3 import (
ZingMp3SongIE,
ZingMp3AlbumIE,
)
+from .zippcast import ZippCastIE
_ALL_CLASSES = [
klass
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index bf21a6887..8157da2cb 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -187,7 +187,8 @@ class AdultSwimIE(InfoExtractor):
media_url = file_el.text
if determine_ext(media_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- media_url, segment_title, 'mp4', preference=0, m3u8_id='hls'))
+ media_url, segment_title, 'mp4', preference=0,
+ m3u8_id='hls', fatal=False))
else:
formats.append({
'format_id': '%s_%s' % (bitrate, ftype),
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
new file mode 100644
index 000000000..43d7b0523
--- /dev/null
+++ b/youtube_dl/extractor/aenetworks.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class AENetworksIE(InfoExtractor):
+ IE_NAME = 'aenetworks'
+ IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
+
+ _TESTS = [{
+ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
+ 'info_dict': {
+ 'id': 'g12m5Gyt3fdR',
+ 'ext': 'mp4',
+ 'title': "Bet You Didn't Know: Valentine's Day",
+ 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'add_ie': ['ThePlatform'],
+ 'expected_warnings': ['JSON-LD'],
+ }, {
+ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
+ 'info_dict': {
+ 'id': 'eg47EERs_JsZ',
+ 'ext': 'mp4',
+ 'title': "Winter Is Coming",
+ 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'add_ie': ['ThePlatform'],
+ }, {
+ 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry',
+ 'only_matching': True
+ }, {
+ 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage',
+ 'only_matching': True
+ }, {
+ 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url_re = [
+ r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
+ r"media_url\s*=\s*'([^']+)'"
+ ]
+ video_url = self._search_regex(video_url_re, webpage, 'video url')
+
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+ info.update({
+ '_type': 'url_transparent',
+ 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}),
+ })
+ return info
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index 1035d1c48..69e6baff7 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -76,5 +76,6 @@ class AMPIE(InfoExtractor):
'thumbnails': thumbnails,
'timestamp': parse_iso8601(item.get('pubDate'), ' '),
'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')),
+ 'subtitles': subtitles,
'formats': formats,
}
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 3fb042cea..b8f9ae005 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -132,11 +132,6 @@ class AtresPlayerIE(InfoExtractor):
})
formats.append(format_info)
- m3u8_url = player.get('urlVideoHls')
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
-
timestamp = int_or_none(self._download_webpage(
self._TIME_API_URL,
video_id, 'Downloading timestamp', fatal=False), 1000, time.time())
diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py
index 9b037bb0c..3b2effa15 100644
--- a/youtube_dl/extractor/audimedia.py
+++ b/youtube_dl/extractor/audimedia.py
@@ -45,11 +45,15 @@ class AudiMediaIE(InfoExtractor):
stream_url_hls = json_data.get('stream_url_hls')
if stream_url_hls:
- formats.extend(self._extract_m3u8_formats(stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ stream_url_hls, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
stream_url_hds = json_data.get('stream_url_hds')
if stream_url_hds:
- formats.extend(self._extract_f4m_formats(json_data.get('stream_url_hds') + '?hdcore=3.4.0', video_id, -1, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ stream_url_hds + '?hdcore=3.4.0',
+ video_id, f4m_id='hds', fatal=False))
for video_version in json_data.get('video_versions'):
video_version_url = video_version.get('download_url') or video_version.get('stream_url')
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
index e37ee4440..76b21e596 100644
--- a/youtube_dl/extractor/baidu.py
+++ b/youtube_dl/extractor/baidu.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..utils import unescapeHTML
class BaiduVideoIE(InfoExtractor):
@@ -14,8 +14,8 @@ class BaiduVideoIE(InfoExtractor):
'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
'info_dict': {
'id': '1069',
- 'title': '中华小当家 TV版 (全52集)',
- 'description': 'md5:395a419e41215e531c857bb037bbaf80',
+ 'title': '中华小当家 TV版国语',
+ 'description': 'md5:51be07afe461cf99fa61231421b5397c',
},
'playlist_count': 52,
}, {
@@ -25,45 +25,32 @@ class BaiduVideoIE(InfoExtractor):
'title': 're:^奔跑吧兄弟',
'description': 'md5:1bf88bad6d850930f542d51547c089b8',
},
- 'playlist_mincount': 3,
+ 'playlist_mincount': 12,
}]
+ def _call_api(self, path, category, playlist_id, note):
+ return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (
+ path, category, playlist_id), playlist_id, note)
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
- category = category2 = mobj.group('type')
+ category, playlist_id = re.match(self._VALID_URL, url).groups()
if category == 'show':
- category2 = 'tvshow'
-
- webpage = self._download_webpage(url, playlist_id)
-
- playlist_title = self._html_search_regex(
- r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage,
- 'playlist title', group='title')
- playlist_description = self._html_search_regex(
- r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage,
- playlist_id, 'playlist description')
+ category = 'tvshow'
+ if category == 'tv':
+ category = 'tvplay'
- site = self._html_search_regex(
- r'filterSite\s*:\s*["\']([^"]*)["\']', webpage,
- 'primary provider site')
- api_result = self._download_json(
- 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % (
- category, category2, playlist_id, site),
- playlist_id, 'Get playlist links')
+ playlist_detail = self._call_api(
+ 'xqinfo', category, playlist_id, 'Download playlist JSON metadata')
- entries = []
- for episode in api_result[0]['episodes']:
- episode_id = '%s_%s' % (playlist_id, episode['episode'])
+ playlist_title = playlist_detail['title']
+ playlist_description = unescapeHTML(playlist_detail.get('intro'))
- redirect_page = self._download_webpage(
- compat_urlparse.urljoin(url, episode['url']), episode_id,
- note='Download Baidu redirect page')
- real_url = self._html_search_regex(
- r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL')
+ episodes_detail = self._call_api(
+ 'xqsingle', category, playlist_id, 'Download episodes JSON metadata')
- entries.append(self.url_result(
- real_url, video_title=episode['single_title']))
+ entries = [self.url_result(
+ episode['url'], video_title=episode['title']
+ ) for episode in episodes_detail['videos']]
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 923273fb2..1c493b72d 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -23,7 +23,17 @@ class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_ID_REGEX = r'[pb][\da-z]{7}'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>%s)' % _ID_REGEX
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?bbc\.co\.uk/
+ (?:
+ programmes/(?!articles/)|
+ iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
+ music/clips[/#]|
+ radio/player/
+ )
+ (?P<id>%s)
+ ''' % _ID_REGEX
_MEDIASELECTOR_URLS = [
# Provides HQ HLS streams with even better quality than pc mediaset but fails
@@ -114,14 +124,14 @@ class BBCCoUkIE(InfoExtractor):
},
'skip': 'Episode is no longer available on BBC iPlayer Radio',
}, {
- 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
+ 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
'note': 'Audio',
'info_dict': {
- 'id': 'p02frcch',
+ 'id': 'p022h44j',
'ext': 'flv',
- 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
- 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
- 'duration': 3507,
+ 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
+ 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
+ 'duration': 227,
},
'params': {
# rtmp download
@@ -172,13 +182,12 @@ class BBCCoUkIE(InfoExtractor):
}, {
# iptv-all mediaset fails with geolocation however there is no geo restriction
# for this programme at all
- 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf',
+ 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
'info_dict': {
- 'id': 'b06bp7kf',
+ 'id': 'b06rkms3',
'ext': 'flv',
- 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie",
- 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.',
- 'duration': 10800,
+ 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
+ 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
},
'params': {
# rtmp download
@@ -193,6 +202,9 @@ class BBCCoUkIE(InfoExtractor):
}, {
'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
+ 'only_matching': True,
}
]
@@ -469,7 +481,8 @@ class BBCCoUkIE(InfoExtractor):
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
- title = self._og_search_title(webpage)
+ title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+ r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', webpage, 'title')
description = self._search_regex(
r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
webpage, 'description', default=None)
@@ -705,19 +718,10 @@ class BBCIE(BBCCoUkIE):
webpage = self._download_webpage(url, playlist_id)
- timestamp = None
- playlist_title = None
- playlist_description = None
-
- ld = self._parse_json(
- self._search_regex(
- r'(?s)<script type="application/ld\+json">(.+?)</script>',
- webpage, 'ld json', default='{}'),
- playlist_id, fatal=False)
- if ld:
- timestamp = parse_iso8601(ld.get('datePublished'))
- playlist_title = ld.get('headline')
- playlist_description = ld.get('articleBody')
+ json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
+ timestamp = json_ld_info.get('timestamp')
+ playlist_title = json_ld_info.get('title')
+ playlist_description = json_ld_info.get('description')
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index c8d921daf..34c2a756f 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -34,7 +34,7 @@ class BeegIE(InfoExtractor):
video_id = self._match_id(url)
video = self._download_json(
- 'http://beeg.com/api/v5/video/%s' % video_id, video_id)
+ 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id)
def split(o, e):
def cut(s, x):
@@ -60,7 +60,7 @@ class BeegIE(InfoExtractor):
def decrypt_url(encrypted_url):
encrypted_url = self._proto_relative_url(
- encrypted_url.replace('{DATA_MARKERS}', ''), 'http:')
+ encrypted_url.replace('{DATA_MARKERS}', ''), 'https:')
key = self._search_regex(
r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None)
if not key:
diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py
new file mode 100644
index 000000000..33762ad93
--- /dev/null
+++ b/youtube_dl/extractor/bigflix.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class BigflixIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537',
+ 'md5': 'ec76aa9b1129e2e5b301a474e54fab74',
+ 'info_dict': {
+ 'id': '16537',
+ 'ext': 'mp4',
+ 'title': 'Singham Returns',
+ 'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d',
+ }
+ }, {
+ # 2 formats
+ 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
+ 'info_dict': {
+ 'id': '16070',
+ 'ext': 'mp4',
+ 'title': 'Madarasapatinam',
+ 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca',
+ 'formats': 'mincount:2',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # multiple formats
+ 'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
+ webpage, 'title')
+
+ def decode_url(quoted_b64_url):
+ return base64.b64decode(compat_urllib_parse_unquote(
+ quoted_b64_url).encode('ascii')).decode('utf-8')
+
+ formats = []
+ for height, encoded_url in re.findall(
+ r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage):
+ video_url = decode_url(encoded_url)
+ f = {
+ 'url': video_url,
+ 'format_id': '%sp' % height,
+ 'height': int(height),
+ }
+ if video_url.startswith('rtmp'):
+ f['ext'] = 'flv'
+ formats.append(f)
+
+ file_url = self._search_regex(
+ r'file=([^&]+)', webpage, 'video url', default=None)
+ if file_url:
+ video_url = decode_url(file_url)
+ if all(f['url'] != video_url for f in formats):
+ formats.append({
+ 'url': decode_url(file_url),
+ })
+
+ self._sort_formats(formats)
+
+ description = self._html_search_meta('description', webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f6a1ff381..f1f128c45 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -9,9 +9,9 @@ from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
- _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.canalc2.tv/video/12163',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
@@ -23,24 +23,36 @@ class Canalc2IE(InfoExtractor):
'params': {
'skip_download': True, # Requires rtmpdump
}
- }
+ }, {
+ 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(
- r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2',
- webpage, 'video_url', group='file')
- formats = [{'url': video_url}]
- if video_url.startswith('rtmp://'):
- rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
- formats[0].update({
- 'url': rtmp.group('url'),
- 'ext': 'flv',
- 'app': rtmp.group('app'),
- 'play_path': rtmp.group('play_path'),
- 'page_url': url,
- })
+
+ webpage = self._download_webpage(
+ 'http://www.canalc2.tv/video/%s' % video_id, video_id)
+
+ formats = []
+ for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
+ if video_url.startswith('rtmp://'):
+ rtmp = re.search(
+ r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+ formats.append({
+ 'url': rtmp.group('url'),
+ 'format_id': 'rtmp',
+ 'ext': 'flv',
+ 'app': rtmp.group('app'),
+ 'play_path': rtmp.group('play_path'),
+ 'page_url': url,
+ })
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'http',
+ })
+ self._sort_formats(formats)
title = self._html_search_regex(
r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title')
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 004372f8d..25b2d4efe 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -10,13 +10,14 @@ from ..utils import (
unified_strdate,
url_basename,
qualities,
+ int_or_none,
)
class CanalplusIE(InfoExtractor):
IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv'
_VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
- _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s'
+ _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
_SITE_ID_MAP = {
'canalplus.fr': 'cplus',
'piwiplus.fr': 'teletoon',
@@ -26,10 +27,10 @@ class CanalplusIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
- 'md5': 'b3481d7ca972f61e37420798d0a9d934',
+ 'md5': '12164a6f14ff6df8bd628e8ba9b10b78',
'info_dict': {
'id': '1263092',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Le Zapping - 13/05/15',
'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
'upload_date': '20150513',
@@ -56,10 +57,10 @@ class CanalplusIE(InfoExtractor):
'skip': 'videos get deleted after a while',
}, {
'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
- 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4',
+ 'md5': '38b8f7934def74f0d6f3ba6c036a5f82',
'info_dict': {
'id': '1213714',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45',
'description': 'md5:8216206ec53426ea6321321f3b3c16db',
'upload_date': '20150211',
@@ -82,15 +83,16 @@ class CanalplusIE(InfoExtractor):
webpage, 'video id', group='id')
info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
- doc = self._download_xml(info_url, video_id, 'Downloading video XML')
+ video_data = self._download_json(info_url, video_id, 'Downloading video JSON')
- video_info = [video for video in doc if video.find('ID').text == video_id][0]
- media = video_info.find('MEDIA')
- infos = video_info.find('INFOS')
+ if isinstance(video_data, list):
+ video_data = [video for video in video_data if video.get('ID') == video_id][0]
+ media = video_data['MEDIA']
+ infos = video_data['INFOS']
- preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'])
+ preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])
- fmt_url = next(iter(media.find('VIDEOS'))).text
+ fmt_url = next(iter(media.get('VIDEOS')))
if '/geo' in fmt_url.lower():
response = self._request_webpage(
HEADRequest(fmt_url), video_id,
@@ -101,35 +103,42 @@ class CanalplusIE(InfoExtractor):
expected=True)
formats = []
- for fmt in media.find('VIDEOS'):
- format_url = fmt.text
+ for format_id, format_url in media['VIDEOS'].items():
if not format_url:
continue
- format_id = fmt.tag
if format_id == 'HLS':
formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', preference=preference(format_id)))
+ format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
elif format_id == 'HDS':
formats.extend(self._extract_f4m_formats(
- format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id)))
+ format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False))
else:
formats.append({
- 'url': format_url,
+ # the secret is extracted from the ya function in http://player.canalplus.fr/common/js/canalPlayer.js
+ 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes',
'format_id': format_id,
'preference': preference(format_id),
})
self._sort_formats(formats)
+ thumbnails = [{
+ 'id': image_id,
+ 'url': image_url,
+ } for image_id, image_url in media.get('images', {}).items()]
+
+ titrage = infos['TITRAGE']
+
return {
'id': video_id,
'display_id': display_id,
- 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text,
- infos.find('TITRAGE/SOUS_TITRE').text),
- 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
- 'thumbnail': media.find('IMAGES/GRAND').text,
- 'description': infos.find('DESCRIPTION').text,
- 'view_count': int(infos.find('NB_VUES').text),
- 'like_count': int(infos.find('NB_LIKES').text),
- 'comment_count': int(infos.find('NB_COMMENTS').text),
+ 'title': '%s - %s' % (titrage['TITRE'],
+ titrage['SOUS_TITRE']),
+ 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')),
+ 'thumbnails': thumbnails,
+ 'description': infos.get('DESCRIPTION'),
+ 'duration': int_or_none(infos.get('DURATION')),
+ 'view_count': int_or_none(infos.get('NB_VUES')),
+ 'like_count': int_or_none(infos.get('NB_LIKES')),
+ 'comment_count': int_or_none(infos.get('NB_COMMENTS')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
new file mode 100644
index 000000000..ee19ff836
--- /dev/null
+++ b/youtube_dl/extractor/canvas.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class CanvasIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
+ 'md5': 'ea838375a547ac787d4064d8c7860a6c',
+ 'info_dict': {
+ 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
+ 'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
+ 'ext': 'mp4',
+ 'title': 'De afspraak veilt voor de Warmste Week',
+ 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 49.02,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._search_regex(
+ r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
+ webpage, 'title', default=None) or self._og_search_title(webpage)
+
+ video_id = self._html_search_regex(
+ r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'video id', group='id')
+
+ data = self._download_json(
+ 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id)
+
+ formats = []
+ for target in data['targetUrls']:
+ format_url, format_type = target.get('url'), target.get('type')
+ if not format_url or not format_type:
+ continue
+ if format_type == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, entry_protocol='m3u8_native',
+ ext='mp4', preference=0, fatal=False, m3u8_id=format_type))
+ elif format_type == 'HDS':
+ formats.extend(self._extract_f4m_formats(
+ format_url, display_id, f4m_id=format_type, fatal=False))
+ else:
+ formats.append({
+ 'format_id': format_type,
+ 'url': format_url,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'formats': formats,
+ 'duration': float_or_none(data.get('duration'), 1000),
+ 'thumbnail': data.get('posterImageUrl'),
+ }
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index f9a64a0a2..d211ec23b 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -5,6 +5,7 @@ import re
import json
from .common import InfoExtractor
+from ..utils import remove_start
class CBSNewsIE(InfoExtractor):
@@ -62,6 +63,7 @@ class CBSNewsIE(InfoExtractor):
uri = item.get('media' + format_id + 'URI')
if not uri:
continue
+ uri = remove_start(uri, '{manifest:none}')
fmt = {
'url': uri,
'format_id': format_id,
@@ -70,6 +72,8 @@ class CBSNewsIE(InfoExtractor):
play_path = re.sub(
r'{slistFilePath}', '',
uri.split('<break>')[-1].split('{break}')[-1])
+ play_path = re.sub(
+ r'{manifest:.+}.*$', '', play_path)
fmt.update({
'app': 'ondemand?auth=cbs',
'play_path': 'mp4:' + play_path,
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index 6924eac70..e94b1e35b 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ parse_duration,
qualities,
unified_strdate,
)
@@ -12,21 +13,25 @@ from ..utils import (
class CCCIE(InfoExtractor):
IE_NAME = 'media.ccc.de'
- _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/[^?#]+/[^?#/]*?_(?P<id>[0-9]{8,})._[^?#/]*\.html'
+ _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P<id>[^/?#&]+)'
- _TEST = {
- 'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
+ _TESTS = [{
+ 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
- 'id': '20131228183',
+ 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor',
'ext': 'mp4',
'title': 'Introduction to Processor Design',
- 'description': 'md5:5ddbf8c734800267f2cee4eab187bc1b',
+ 'description': 'md5:80be298773966f66d56cb11260b879af',
'thumbnail': 're:^https?://.*\.jpg$',
'view_count': int,
- 'upload_date': '20131229',
+ 'upload_date': '20131228',
+ 'duration': 3660,
}
- }
+ }, {
+ 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -40,21 +45,25 @@ class CCCIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
- r"(?s)<p class='description'>(.*?)</p>",
+ r"(?s)<h3>About</h3>(.+?)<h3>",
webpage, 'description', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
- r"(?s)<span class='[^']*fa-calendar-o'></span>(.*?)</li>",
+ r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>",
webpage, 'upload date', fatal=False))
view_count = int_or_none(self._html_search_regex(
r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
webpage, 'view count', fatal=False))
+ duration = parse_duration(self._html_search_regex(
+ r'(?s)<span[^>]+class=(["\']).*?fa-clock-o.*?\1[^>]*></span>(?P<duration>.+?)</li',
+ webpage, 'duration', fatal=False, group='duration'))
matches = re.finditer(r'''(?xs)
- <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
+ <(?:span|div)\s+class='label\s+filetype'>(?P<format>[^<]*)</(?:span|div)>\s*
+ <(?:span|div)\s+class='label\s+filetype'>(?P<lang>[^<]*)</(?:span|div)>\s*
<a\s+download\s+href='(?P<http_url>[^']+)'>\s*
(?:
.*?
- <a\s+href='(?P<torrent_url>[^']+\.torrent)'
+ <a\s+(?:download\s+)?href='(?P<torrent_url>[^']+\.torrent)'
)?''', webpage)
formats = []
for m in matches:
@@ -62,12 +71,15 @@ class CCCIE(InfoExtractor):
format_id = self._search_regex(
r'.*/([a-z0-9_-]+)/[^/]*$',
m.group('http_url'), 'format id', default=None)
+ if format_id:
+ format_id = m.group('lang') + '-' + format_id
vcodec = 'h264' if 'h264' in format_id else (
'none' if format_id in ('mp3', 'opus') else None
)
formats.append({
'format_id': format_id,
'format': format,
+ 'language': m.group('lang'),
'url': m.group('http_url'),
'vcodec': vcodec,
'preference': preference(format_id),
@@ -95,5 +107,6 @@ class CCCIE(InfoExtractor):
'thumbnail': thumbnail,
'view_count': view_count,
'upload_date': upload_date,
+ 'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 655207447..8da70ae14 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -34,6 +34,7 @@ from ..utils import (
fix_xml_ampersands,
float_or_none,
int_or_none,
+ parse_iso8601,
RegexNotFoundError,
sanitize_filename,
sanitized_Request,
@@ -108,8 +109,9 @@ class InfoExtractor(object):
-2 or smaller for less than default.
< -1000 to hide the format (if there is
another one which is strictly better)
- * language_preference Is this in the correct requested
- language?
+ * language Language code, e.g. "de" or "en-US".
+ * language_preference Is this in the language mentioned in
+ the URL?
10 if it's what the URL is about,
-1 for default (don't know),
-10 otherwise, other values reserved for now.
@@ -200,6 +202,26 @@ class InfoExtractor(object):
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ The following fields should only be used when the video belongs to some logical
+ chapter or section:
+
+ chapter: Name or title of the chapter the video belongs to.
+ chapter_number: Number of the chapter the video belongs to, as an integer.
+ chapter_id: Id of the chapter the video belongs to, as a unicode string.
+
+ The following fields should only be used when the video is an episode of some
+ series or programme:
+
+ series: Title of the series or programme the video episode belongs to.
+ season: Title of the season the video episode belongs to.
+ season_number: Number of the season the video episode belongs to, as an integer.
+ season_id: Id of the season the video episode belongs to, as a unicode string.
+ episode: Title of the video episode. Unlike mandatory video title field,
+ this field should denote the exact title of the video episode
+ without any kind of decoration.
+ episode_number: Number of the video episode within a season, as an integer.
+ episode_id: Id of the video episode, as a unicode string.
+
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
@@ -292,9 +314,9 @@ class InfoExtractor(object):
except ExtractorError:
raise
except compat_http_client.IncompleteRead as e:
- raise ExtractorError('A network error has occured.', cause=e, expected=True)
+ raise ExtractorError('A network error has occurred.', cause=e, expected=True)
except (KeyError, StopIteration) as e:
- raise ExtractorError('An extractor error has occured.', cause=e)
+ raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
"""Sets the downloader for this IE."""
@@ -741,6 +763,42 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
+ def _search_json_ld(self, html, video_id, **kwargs):
+ json_ld = self._search_regex(
+ r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+ html, 'JSON-LD', group='json_ld', **kwargs)
+ if not json_ld:
+ return {}
+ return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+
+ def _json_ld(self, json_ld, video_id, fatal=True):
+ if isinstance(json_ld, compat_str):
+ json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
+ if not json_ld:
+ return {}
+ info = {}
+ if json_ld.get('@context') == 'http://schema.org':
+ item_type = json_ld.get('@type')
+ if item_type == 'TVEpisode':
+ info.update({
+ 'episode': unescapeHTML(json_ld.get('name')),
+ 'episode_number': int_or_none(json_ld.get('episodeNumber')),
+ 'description': unescapeHTML(json_ld.get('description')),
+ })
+ part_of_season = json_ld.get('partOfSeason')
+ if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
+ info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+ part_of_series = json_ld.get('partOfSeries')
+ if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+ info['series'] = unescapeHTML(part_of_series.get('name'))
+ elif item_type == 'Article':
+ info.update({
+ 'timestamp': parse_iso8601(json_ld.get('datePublished')),
+ 'title': unescapeHTML(json_ld.get('headline')),
+ 'description': unescapeHTML(json_ld.get('articleBody')),
+ })
+ return dict((k, v) for k, v in info.items() if v is not None)
+
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 00d943f77..785594df8 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -329,8 +329,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
streamdata_req, video_id,
note='Downloading media info for %s' % video_format)
stream_info = streamdata.find('./{default}preload/stream_info')
- video_url = stream_info.find('./host').text
- video_play_path = stream_info.find('./file').text
+ video_url = xpath_text(stream_info, './host')
+ video_play_path = xpath_text(stream_info, './file')
+ if not video_url or not video_play_path:
+ continue
metadata = stream_info.find('./metadata')
format_info = {
'format': video_format,
diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py
new file mode 100644
index 000000000..9c764fe68
--- /dev/null
+++ b/youtube_dl/extractor/cultureunplugged.py
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class CultureUnpluggedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?'
+ _TESTS = [{
+ 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West',
+ 'md5': 'ac6c093b089f7d05e79934dcb3d228fc',
+ 'info_dict': {
+ 'id': '53662',
+ 'display_id': 'The-Next--Best-West',
+ 'ext': 'mp4',
+ 'title': 'The Next, Best West',
+ 'description': 'md5:0423cd00833dea1519cf014e9d0903b1',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'creator': 'Coldstream Creative',
+ 'duration': 2203,
+ 'view_count': int,
+ }
+ }, {
+ 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ movie_data = self._download_json(
+ 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)
+
+ video_url = movie_data['url']
+ title = movie_data['title']
+
+ description = movie_data.get('synopsis')
+ creator = movie_data.get('producer')
+ duration = int_or_none(movie_data.get('duration'))
+ view_count = int_or_none(movie_data.get('views'))
+
+ thumbnails = [{
+ 'url': movie_data['%s_thumb' % size],
+ 'id': size,
+ 'preference': preference,
+ } for preference, size in enumerate((
+ 'small', 'large')) if movie_data.get('%s_thumb' % size)]
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'creator': creator,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
new file mode 100644
index 000000000..36af67013
--- /dev/null
+++ b/youtube_dl/extractor/cwtv.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class CWTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+ _TESTS = [{
+ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
+ 'info_dict': {
+ 'id': '6b15e985-9345-4f60-baf8-56e96be57c63',
+ 'ext': 'mp4',
+ 'title': 'Legends of Yesterday',
+ 'description': 'Oliver and Barry Allen take Kendra Saunders and Carter Hall to a remote location to keep them hidden from Vandal Savage while they figure out how to defeat him.',
+ 'duration': 2665,
+ 'series': 'Arrow',
+ 'season_number': 4,
+ 'season': '4',
+ 'episode_number': 8,
+ 'upload_date': '20151203',
+ 'timestamp': 1449122100,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088',
+ 'info_dict': {
+ 'id': '24282b12-ead2-42f2-95ad-26770c2c6088',
+ 'ext': 'mp4',
+ 'title': 'Jeff Davis 4',
+ 'description': 'Jeff Davis is back to make you laugh.',
+ 'duration': 1263,
+ 'series': 'Whose Line Is It Anyway?',
+ 'season_number': 11,
+ 'season': '11',
+ 'episode_number': 20,
+ 'upload_date': '20151006',
+ 'timestamp': 1444107300,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id)
+
+ formats = self._extract_m3u8_formats(
+ video_data['videos']['variantplaylist']['uri'], video_id, 'mp4')
+
+ thumbnails = [{
+ 'url': image['uri'],
+ 'width': image.get('width'),
+ 'height': image.get('height'),
+ } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None
+
+ video_metadata = video_data['assetFields']
+
+ subtitles = {
+ 'en': [{
+ 'url': video_metadata['UnicornCcUrl'],
+ }],
+ } if video_metadata.get('UnicornCcUrl') else None
+
+ return {
+ 'id': video_id,
+ 'title': video_metadata['title'],
+ 'description': video_metadata.get('description'),
+ 'duration': int_or_none(video_metadata.get('duration')),
+ 'series': video_metadata.get('seriesName'),
+ 'season_number': int_or_none(video_metadata.get('seasonNumber')),
+ 'season': video_metadata.get('seasonName'),
+ 'episode_number': int_or_none(video_metadata.get('episodeNumber')),
+ 'timestamp': parse_iso8601(video_data.get('startTime')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 439fd42e8..6e462af69 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -37,7 +37,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
- _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
+ _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
_FORMATS = [
@@ -104,6 +104,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
{
'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
'only_matching': True,
+ },
+ {
+ 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+ 'only_matching': True,
}
]
@@ -149,14 +153,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
ext = determine_ext(media_url)
if type_ == 'application/x-mpegURL' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ media_url, video_id, 'mp4', preference=-1,
+ m3u8_id='hls', fatal=False))
elif type_ == 'application/f4m' or ext == 'f4m':
formats.extend(self._extract_f4m_formats(
media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
else:
f = {
'url': media_url,
- 'format_id': quality,
+ 'format_id': 'http-%s' % quality,
}
m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
if m:
@@ -335,7 +340,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
index 8f48571de..15a1c40f7 100644
--- a/youtube_dl/extractor/dcn.py
+++ b/youtube_dl/extractor/dcn.py
@@ -5,7 +5,10 @@ import re
import base64
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import (
+ compat_urllib_parse,
+ compat_str,
+)
from ..utils import (
int_or_none,
parse_iso8601,
@@ -186,7 +189,8 @@ class DCNSeasonIE(InfoExtractor):
entries = []
for video in show['videos']:
+ video_id = compat_str(video['id'])
entries.append(self.url_result(
- 'http://www.dcndigital.ae/media/%s' % video['id'], 'DCNVideo'))
+ 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo', video_id))
return self.playlist_result(entries, season_id, title)
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index d6723ecf2..ce680a9f3 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -9,7 +9,17 @@ from ..compat import compat_str
class DiscoveryIE(InfoExtractor):
- _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
+ _VALID_URL = r'''(?x)http://(?:www\.)?(?:
+ discovery|
+ investigationdiscovery|
+ discoverylife|
+ animalplanet|
+ ahctv|
+ destinationamerica|
+ sciencechannel|
+ tlc|
+ velocity
+ )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)'''
_TESTS = [{
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
'info_dict': {
@@ -21,8 +31,8 @@ class DiscoveryIE(InfoExtractor):
'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
' back.'),
'duration': 156,
- 'timestamp': 1303099200,
- 'upload_date': '20110418',
+ 'timestamp': 1302032462,
+ 'upload_date': '20110405',
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -33,27 +43,38 @@ class DiscoveryIE(InfoExtractor):
'id': 'mythbusters-the-simpsons',
'title': 'MythBusters: The Simpsons',
},
- 'playlist_count': 9,
+ 'playlist_mincount': 10,
+ }, {
+ 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/',
+ 'info_dict': {
+ 'id': '78326',
+ 'ext': 'mp4',
+ 'title': 'Longfin Eels: Maneaters?',
+ 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.',
+ 'upload_date': '20140725',
+ 'timestamp': 1406246400,
+ 'duration': 116,
+ },
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- info = self._download_json(url + '?flat=1', video_id)
+ display_id = self._match_id(url)
+ info = self._download_json(url + '?flat=1', display_id)
video_title = info.get('playlist_title') or info.get('video_title')
entries = [{
'id': compat_str(video_info['id']),
'formats': self._extract_m3u8_formats(
- video_info['src'], video_id, ext='mp4',
+ video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
note='Download m3u8 information for video %d' % (idx + 1)),
'title': video_info['title'],
'description': video_info.get('description'),
'duration': parse_duration(video_info.get('video_length')),
- 'webpage_url': video_info.get('href'),
+ 'webpage_url': video_info.get('href') or video_info.get('url'),
'thumbnail': video_info.get('thumbnailURL'),
'alt_title': video_info.get('secondary_title'),
'timestamp': parse_iso8601(video_info.get('publishedDate')),
} for idx, video_info in enumerate(info['playlist'])]
- return self.playlist_result(entries, video_id, video_title)
+ return self.playlist_result(entries, display_id, video_title)
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index 60ed438f8..d35e88881 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -12,6 +12,7 @@ from ..compat import (
from ..utils import (
ExtractorError,
clean_html,
+ int_or_none,
sanitized_Request,
)
@@ -66,13 +67,15 @@ class DramaFeverBaseIE(AMPIE):
class DramaFeverIE(DramaFeverBaseIE):
IE_NAME = 'dramafever'
_VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
'info_dict': {
'id': '4512.1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cooking with Shin 4512.1',
'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+ 'episode': 'Episode 1',
+ 'episode_number': 1,
'thumbnail': 're:^https?://.*\.jpg',
'timestamp': 1404336058,
'upload_date': '20140702',
@@ -82,7 +85,25 @@ class DramaFeverIE(DramaFeverBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
+ 'info_dict': {
+ 'id': '4826.4',
+ 'ext': 'mp4',
+ 'title': 'Mnet Asian Music Awards 2015 4826.4',
+ 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
+ 'episode': 'Mnet Asian Music Awards 2015 - Part 3',
+ 'episode_number': 4,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1450213200,
+ 'upload_date': '20151215',
+ 'duration': 5602,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url).replace('/', '.')
@@ -105,13 +126,22 @@ class DramaFeverIE(DramaFeverBaseIE):
video_id, 'Downloading episode info JSON', fatal=False)
if episode_info:
value = episode_info.get('value')
- if value:
- subfile = value[0].get('subfile') or value[0].get('new_subfile')
- if subfile and subfile != 'http://www.dramafever.com/st/':
- info['subtitiles'].setdefault('English', []).append({
- 'ext': 'srt',
- 'url': subfile,
- })
+ if isinstance(value, list):
+ for v in value:
+ if v.get('type') == 'Episode':
+ subfile = v.get('subfile') or v.get('new_subfile')
+ if subfile and subfile != 'http://www.dramafever.com/st/':
+ info.setdefault('subtitles', {}).setdefault('English', []).append({
+ 'ext': 'srt',
+ 'url': subfile,
+ })
+ episode_number = int_or_none(v.get('number'))
+ episode_fallback = 'Episode'
+ if episode_number:
+ episode_fallback += ' %d' % episode_number
+ info['episode'] = v.get('title') or episode_fallback
+ info['episode_number'] = episode_number
+ break
return info
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index 8ac8587be..028144f20 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -2,14 +2,10 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- unified_strdate,
-)
+from .zdf import ZDFIE
-class DreiSatIE(InfoExtractor):
+class DreiSatIE(ZDFIE):
IE_NAME = '3sat'
_VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TESTS = [
@@ -35,53 +31,4 @@ class DreiSatIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
- details_doc = self._download_xml(details_url, video_id, 'Downloading video details')
-
- status_code = details_doc.find('./status/statuscode')
- if status_code is not None and status_code.text != 'ok':
- code = status_code.text
- if code == 'notVisibleAnymore':
- message = 'Video %s is not available' % video_id
- else:
- message = '%s returned error: %s' % (self.IE_NAME, code)
- raise ExtractorError(message, expected=True)
-
- thumbnail_els = details_doc.findall('.//teaserimage')
- thumbnails = [{
- 'width': int(te.attrib['key'].partition('x')[0]),
- 'height': int(te.attrib['key'].partition('x')[2]),
- 'url': te.text,
- } for te in thumbnail_els]
-
- information_el = details_doc.find('.//information')
- video_title = information_el.find('./title').text
- video_description = information_el.find('./detail').text
-
- details_el = details_doc.find('.//details')
- video_uploader = details_el.find('./channel').text
- upload_date = unified_strdate(details_el.find('./airtime').text)
-
- format_els = details_doc.findall('.//formitaet')
- formats = [{
- 'format_id': fe.attrib['basetype'],
- 'width': int(fe.find('./width').text),
- 'height': int(fe.find('./height').text),
- 'url': fe.find('./url').text,
- 'filesize': int(fe.find('./filesize').text),
- 'video_bitrate': int(fe.find('./videoBitrate').text),
- } for fe in format_els
- if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]
-
- self._sort_formats(formats)
-
- return {
- '_type': 'video',
- 'id': video_id,
- 'title': video_title,
- 'formats': formats,
- 'description': video_description,
- 'thumbnails': thumbnails,
- 'thumbnail': thumbnails[-1]['url'],
- 'uploader': video_uploader,
- 'upload_date': upload_date,
- }
+ return self.extract_from_xml_url(video_id, details_url)
diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py
index 5dfea0d39..f7339702c 100644
--- a/youtube_dl/extractor/einthusan.py
+++ b/youtube_dl/extractor/einthusan.py
@@ -1,9 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ remove_start,
+ sanitized_Request,
+)
class EinthusanIE(InfoExtractor):
@@ -34,27 +37,33 @@ class EinthusanIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
+ video_id = self._match_id(url)
+
+ request = sanitized_Request(url)
+ request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')
+ webpage = self._download_webpage(request, video_id)
+
+ title = self._html_search_regex(
+ r'<h1><a[^>]+class=["\']movie-title["\'][^>]*>(.+?)</a></h1>',
+ webpage, 'title')
- video_title = self._html_search_regex(
- r'<h1><a class="movie-title".*?>(.*?)</a></h1>', webpage, 'title')
+ video_id = self._search_regex(
+ r'data-movieid=["\'](\d+)', webpage, 'video id', default=video_id)
- video_url = self._html_search_regex(
- r'''(?s)jwplayer\("mediaplayer"\)\.setup\({.*?'file': '([^']+)'.*?}\);''',
- webpage, 'video url')
+ video_url = self._download_webpage(
+ 'http://cdn.einthusan.com/geturl/%s/hd/London,Washington,Toronto,Dallas,San,Sydney/'
+ % video_id, video_id)
description = self._html_search_meta('description', webpage)
thumbnail = self._html_search_regex(
r'''<a class="movie-cover-wrapper".*?><img src=["'](.*?)["'].*?/></a>''',
webpage, "thumbnail url", fatal=False)
if thumbnail is not None:
- thumbnail = thumbnail.replace('..', 'http://www.einthusan.com')
+ thumbnail = compat_urlparse.urljoin(url, remove_start(thumbnail, '..'))
return {
'id': video_id,
- 'title': video_title,
+ 'title': title,
'url': video_url,
'thumbnail': thumbnail,
'description': description,
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
index e6f8f0337..3762d8748 100644
--- a/youtube_dl/extractor/espn.py
+++ b/youtube_dl/extractor/espn.py
@@ -1,18 +1,30 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import remove_end
class ESPNIE(InfoExtractor):
_VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
- _WORKING = False
_TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079',
'info_dict': {
'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
'ext': 'mp4',
- 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
- 'description': '',
+ 'title': '30 for 30 Shorts: Judging Jewell',
+ 'description': None,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
+ 'url': 'http://espn.go.com/video/clip?id=2743663',
+ 'info_dict': {
+ 'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',
+ 'ext': 'mp4',
+ 'title': 'Must-See Moments: Best of the MLS season',
},
'params': {
# m3u8 download
@@ -44,12 +56,23 @@ class ESPNIE(InfoExtractor):
r'class="video-play-button"[^>]+data-id="(\d+)',
webpage, 'video id')
+ cms = 'espn'
+ if 'data-source="intl"' in webpage:
+ cms = 'intl'
+ player_url = 'https://espn.go.com/video/iframe/twitter/?id=%s&cms=%s' % (video_id, cms)
player = self._download_webpage(
- 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+ player_url, video_id)
pcode = self._search_regex(
r'["\']pcode=([^"\']+)["\']', player, 'pcode')
- return self.url_result(
- 'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
- 'OoyalaExternal')
+ title = remove_end(
+ self._og_search_title(webpage),
+ '- ESPN Video').strip()
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'ooyalaexternal:%s:%s:%s' % (cms, video_id, pcode),
+ 'ie_key': 'OoyalaExternal',
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 5e43f2359..ec699ba54 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -105,7 +105,7 @@ class FacebookIE(InfoExtractor):
login_results, 'login error', default=None, group='error')
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
- self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+ self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
return
fb_dtsg = self._search_regex(
@@ -126,7 +126,7 @@ class FacebookIE(InfoExtractor):
check_response = self._download_webpage(check_req, None,
note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
- self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
+ self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err))
return
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
new file mode 100644
index 000000000..ab97b3196
--- /dev/null
+++ b/youtube_dl/extractor/fox.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class FOXIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.fox.com/watch/255180355939/7684182528',
+ 'info_dict': {
+ 'id': '255180355939',
+ 'ext': 'mp4',
+ 'title': 'Official Trailer: Gotham',
+ 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.',
+ 'duration': 129,
+ },
+ 'add_ie': ['ThePlatform'],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ release_url = self._parse_json(self._search_regex(
+ r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'),
+ video_id)['release_url'] + '&manifest=m3u'
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(release_url, {'force_smil_url': True}),
+ 'id': video_id,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 3c3066e38..0baa17e8d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -54,8 +54,10 @@ from .snagfilms import SnagFilmsEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
from .mtv import MTVServicesEmbeddedIE
from .pladform import PladformIE
+from .videomore import VideomoreIE
from .googledrive import GoogleDriveIE
from .jwplatform import JWPlatformIE
+from .ultimedia import UltimediaIE
class GenericIE(InfoExtractor):
@@ -485,7 +487,7 @@ class GenericIE(InfoExtractor):
'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
}
},
- # Embeded Ustream video
+ # Embedded Ustream video
{
'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
'md5': '27b99cdb639c9b12a79bca876a073417',
@@ -1400,7 +1402,7 @@ class GenericIE(InfoExtractor):
# Look for embedded Dailymotion player
matches = re.findall(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
+ r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
if matches:
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
@@ -1642,7 +1644,7 @@ class GenericIE(InfoExtractor):
if myvi_url:
return self.url_result(myvi_url)
- # Look for embeded soundcloud player
+ # Look for embedded soundcloud player
mobj = re.search(
r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
webpage)
@@ -1742,6 +1744,11 @@ class GenericIE(InfoExtractor):
if pladform_url:
return self.url_result(pladform_url)
+ # Look for Videomore embeds
+ videomore_url = VideomoreIE._extract_url(webpage)
+ if videomore_url:
+ return self.url_result(videomore_url)
+
# Look for Playwire embeds
mobj = re.search(
r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
@@ -1807,6 +1814,11 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia')
+ # Look for Ulltimedia embeds
+ ultimedia_url = UltimediaIE._extract_url(webpage)
+ if ultimedia_url:
+ return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia')
+
# Look for AdobeTVVideo embeds
mobj = re.search(
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
diff --git a/youtube_dl/extractor/history.py b/youtube_dl/extractor/history.py
deleted file mode 100644
index f86164afe..000000000
--- a/youtube_dl/extractor/history.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import smuggle_url
-
-
-class HistoryIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?history\.com/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
-
- _TESTS = [{
- 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
- 'md5': '6fe632d033c92aa10b8d4a9be047a7c5',
- 'info_dict': {
- 'id': 'bLx5Dv5Aka1G',
- 'ext': 'mp4',
- 'title': "Bet You Didn't Know: Valentine's Day",
- 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
- },
- 'add_ie': ['ThePlatform'],
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- video_url = self._search_regex(
- r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
- webpage, 'video url')
-
- return self.url_result(smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}))
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index d1c1c210c..c45c68c1d 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -120,19 +120,24 @@ class IGNIE(InfoExtractor):
video_id = self._find_video_id(webpage)
if not video_id:
- return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url'))
+ return self.url_result(self._search_regex(
+ self._EMBED_RE, webpage, 'embed url'))
return self._get_video_info(video_id)
def _get_video_info(self, video_id):
- api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id)
+ api_data = self._download_json(
+ self._API_URL_TEMPLATE % video_id, video_id)
formats = []
m3u8_url = api_data['refs'].get('m3uUrl')
if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
f4m_url = api_data['refs'].get('f4mUrl')
if f4m_url:
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in api_data['assets']:
formats.append({
'url': asset['url'],
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 36baf3245..073777f34 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -14,6 +14,7 @@ from ..utils import (
class IPrimaIE(InfoExtractor):
+ _WORKING = False
_VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
_TESTS = [{
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 029878d24..472d72b4c 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -7,6 +7,7 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ int_or_none,
sanitized_Request,
)
@@ -27,44 +28,36 @@ class IviIE(InfoExtractor):
'title': 'Иван Васильевич меняет профессию',
'description': 'md5:b924063ea1677c8fe343d8a72ac2195f',
'duration': 5498,
- 'thumbnail': 'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
'skip': 'Only works from Russia',
},
- # Serial's serie
+ # Serial's series
{
'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549',
'md5': '221f56b35e3ed815fde2df71032f4b3e',
'info_dict': {
'id': '9549',
'ext': 'mp4',
- 'title': 'Двое из ларца - Серия 1',
+ 'title': 'Двое из ларца - Дело Гольдберга (1 часть)',
+ 'series': 'Двое из ларца',
+ 'season': 'Сезон 1',
+ 'season_number': 1,
+ 'episode': 'Дело Гольдберга (1 часть)',
+ 'episode_number': 1,
'duration': 2655,
- 'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
'skip': 'Only works from Russia',
}
]
# Sorted by quality
- _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ']
-
- # Sorted by size
- _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480']
-
- def _extract_description(self, html):
- m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html)
- return m.group('description') if m is not None else None
-
- def _extract_comment_count(self, html):
- m = re.search('(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html)
- return int(m.group('commentcount')) if m is not None else 0
+ _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ']
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = 'http://api.digitalaccess.ru/api/json/'
-
data = {
'method': 'da.content.get',
'params': [
@@ -76,11 +69,10 @@ class IviIE(InfoExtractor):
]
}
- request = sanitized_Request(api_url, json.dumps(data))
-
- video_json_page = self._download_webpage(
+ request = sanitized_Request(
+ 'http://api.digitalaccess.ru/api/json/', json.dumps(data))
+ video_json = self._download_json(
request, video_id, 'Downloading video JSON')
- video_json = json.loads(video_json_page)
if 'error' in video_json:
error = video_json['error']
@@ -95,35 +87,51 @@ class IviIE(InfoExtractor):
formats = [{
'url': x['url'],
'format_id': x['content_format'],
- 'preference': self._known_formats.index(x['content_format']),
- } for x in result['files'] if x['content_format'] in self._known_formats]
+ 'preference': self._KNOWN_FORMATS.index(x['content_format']),
+ } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS]
self._sort_formats(formats)
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
-
- duration = result['duration']
- compilation = result['compilation']
title = result['title']
+ duration = int_or_none(result.get('duration'))
+ compilation = result.get('compilation')
+ episode = title if compilation else None
+
title = '%s - %s' % (compilation, title) if compilation is not None else title
- previews = result['preview']
- previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format']))
- thumbnail = previews[-1]['url'] if len(previews) > 0 else None
+ thumbnails = [{
+ 'url': preview['url'],
+ 'id': preview.get('content_format'),
+ } for preview in result.get('preview', []) if preview.get('url')]
+
+ webpage = self._download_webpage(url, video_id)
+
+ season = self._search_regex(
+ r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)',
+ webpage, 'season', default=None)
+ season_number = int_or_none(self._search_regex(
+ r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"',
+ webpage, 'season number', default=None))
+
+ episode_number = int_or_none(self._search_regex(
+ r'<meta[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)',
+ webpage, 'episode number', default=None))
- video_page = self._download_webpage(url, video_id, 'Downloading video page')
- description = self._extract_description(video_page)
- comment_count = self._extract_comment_count(video_page)
+ description = self._og_search_description(webpage, default=None) or self._html_search_meta(
+ 'description', webpage, 'description', default=None)
return {
'id': video_id,
'title': title,
- 'thumbnail': thumbnail,
+ 'series': compilation,
+ 'season': season,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'thumbnails': thumbnails,
'description': description,
'duration': duration,
- 'comment_count': comment_count,
'formats': formats,
}
@@ -149,8 +157,11 @@ class IviCompilationIE(InfoExtractor):
}]
def _extract_entries(self, html, compilation_id):
- return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi')
- for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)]
+ return [
+ self.url_result(
+ 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key())
+ for serie in re.findall(
+ r'<a href="/watch/%s/(\d+)"[^>]+data-id="\1"' % compilation_id, html)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -158,7 +169,8 @@ class IviCompilationIE(InfoExtractor):
season_id = mobj.group('seasonid')
if season_id is not None: # Season link
- season_page = self._download_webpage(url, compilation_id, 'Downloading season %s web page' % season_id)
+ season_page = self._download_webpage(
+ url, compilation_id, 'Downloading season %s web page' % season_id)
playlist_id = '%s/season%s' % (compilation_id, season_id)
playlist_title = self._html_search_meta('title', season_page, 'title')
entries = self._extract_entries(season_page, compilation_id)
@@ -166,8 +178,9 @@ class IviCompilationIE(InfoExtractor):
compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page')
playlist_id = compilation_id
playlist_title = self._html_search_meta('title', compilation_page, 'title')
- seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page)
- if len(seasons) == 0: # No seasons in this compilation
+ seasons = re.findall(
+ r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page)
+ if not seasons: # No seasons in this compilation
entries = self._extract_entries(compilation_page, compilation_id)
else:
entries = []
diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py
new file mode 100644
index 000000000..617dc8c07
--- /dev/null
+++ b/youtube_dl/extractor/ivideon.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urlparse,
+)
+from ..utils import qualities
+
+
+class IvideonIE(InfoExtractor):
+ IE_NAME = 'ivideon'
+ IE_DESC = 'Ivideon TV'
+ _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/',
+ 'info_dict': {
+ 'id': '100-916ca13b5c4ad9f564266424a026386d',
+ 'ext': 'flv',
+ 'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0',
+ 'only_matching': True,
+ }]
+
+ _QUALITIES = ('low', 'mid', 'hi')
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ server_id, camera_id = mobj.group('id'), mobj.group('camera_id')
+ camera_name, description = None, None
+ camera_url = compat_urlparse.urljoin(
+ url, '/tv/camera/%s/%s/' % (server_id, camera_id))
+
+ webpage = self._download_webpage(camera_url, server_id, fatal=False)
+ if webpage:
+ config_string = self._search_regex(
+ r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None)
+ if config_string:
+ config = self._parse_json(config_string, server_id, fatal=False)
+ camera_info = config.get('ivTvAppOptions', {}).get('currentCameraInfo')
+ if camera_info:
+ camera_name = camera_info.get('camera_name')
+ description = camera_info.get('misc', {}).get('description')
+ if not camera_name:
+ camera_name = self._html_search_meta(
+ 'name', webpage, 'camera name', default=None) or self._search_regex(
+ r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)', webpage, 'camera name', default=None)
+
+ quality = qualities(self._QUALITIES)
+
+ formats = [{
+ 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({
+ 'server': server_id,
+ 'camera': camera_id,
+ 'sessionId': 'demo',
+ 'q': quality(format_id),
+ }),
+ 'format_id': format_id,
+ 'ext': 'flv',
+ 'quality': quality(format_id),
+ } for format_id in self._QUALITIES]
+ self._sort_formats(formats)
+
+ return {
+ 'id': server_id,
+ 'title': self._live_title(camera_name or server_id),
+ 'description': description,
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
deleted file mode 100644
index da8068efc..000000000
--- a/youtube_dl/extractor/jukebox.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- RegexNotFoundError,
- unescapeHTML,
-)
-
-
-class JukeboxIE(InfoExtractor):
- _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
- _TEST = {
- 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
- 'info_dict': {
- 'id': 'r303r',
- 'ext': 'flv',
- 'title': 'Kosheen-En Vivo Pride',
- 'uploader': 'Kosheen',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- html = self._download_webpage(url, video_id)
- iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
-
- iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
- if re.search(r'class="jkb_waiting"', iframe_html) is not None:
- raise ExtractorError('Video is not available(in your country?)!')
-
- self.report_extraction(video_id)
-
- try:
- video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
- iframe_html, 'video url')
- video_url = unescapeHTML(video_url).replace('\/', '/')
- except RegexNotFoundError:
- youtube_url = self._search_regex(
- r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
- iframe_html, 'youtube url')
- youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
- self.to_screen('Youtube video detected')
- return self.url_result(youtube_url, ie='Youtube')
-
- title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
- html, 'title')
- artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
- html, 'artist')
-
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': artist + '-' + title,
- 'uploader': artist,
- }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 88334889e..425fc9e2a 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -17,7 +17,7 @@ class MDRIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
_TESTS = [{
- # MDR regularily deletes its videos
+ # MDR regularly deletes its videos
'url': 'http://www.mdr.de/fakt/video189002.html',
'only_matching': True,
}, {
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index d887583e6..e8bb527b8 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -167,14 +167,16 @@ class MTVServicesInfoExtractor(InfoExtractor):
'description': description,
}
+ def _get_feed_query(self, uri):
+ data = {'uri': uri}
+ if self._LANG:
+ data['lang'] = self._LANG
+ return compat_urllib_parse.urlencode(data)
+
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
feed_url = self._get_feed_url(uri)
- data = compat_urllib_parse.urlencode({'uri': uri})
- info_url = feed_url + '?'
- if self._LANG:
- info_url += 'lang=%s&' % self._LANG
- info_url += data
+ info_url = feed_url + '?' + self._get_feed_query(uri)
return self._get_videos_info_from_url(info_url, video_id)
def _get_videos_info_from_url(self, url, video_id):
@@ -184,9 +186,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
return self.playlist_result(
[self._get_video_info(item) for item in idoc.findall('.//item')])
- def _real_extract(self, url):
- title = url_basename(url)
- webpage = self._download_webpage(url, title)
+ def _extract_mgid(self, webpage):
try:
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
# or http://media.mtvnservices.com/{mgid}
@@ -207,7 +207,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
'sm4:video:embed', webpage, 'sm4 embed', default='')
mgid = self._search_regex(
r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid')
+ return mgid
+ def _real_extract(self, url):
+ title = url_basename(url)
+ webpage = self._download_webpage(url, title)
+ mgid = self._extract_mgid(webpage)
videos_info = self._get_videos_info(mgid)
return videos_info
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 340c922bd..1dd54c2f1 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -100,7 +100,7 @@ class NBCSportsVPlayerIE(InfoExtractor):
class NBCSportsIE(InfoExtractor):
- # Does not include https becuase its certificate is invalid
+ # Does not include https because its certificate is invalid
_VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
_TEST = {
diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py
new file mode 100644
index 000000000..657ae77a0
--- /dev/null
+++ b/youtube_dl/extractor/nextmovie.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+from ..compat import compat_urllib_parse
+
+
+class NextMovieIE(MTVServicesInfoExtractor):
+ IE_NAME = 'nextmovie.com'
+ _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)'
+ _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm'
+ _TESTS = [{
+ 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/',
+ 'md5': '09a9199f2f11f10107d04fcb153218aa',
+ 'info_dict': {
+ 'id': '961726',
+ 'ext': 'mp4',
+ 'title': 'The Muppets\' Gravity',
+ },
+ }]
+
+ def _get_feed_query(self, uri):
+ return compat_urllib_parse.urlencode({
+ 'feed': '1505',
+ 'mgid': uri,
+ })
+
+ def _real_extract(self, url):
+ mgid = self._match_id(url)
+ return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index e98a5ef89..8d5ce46ad 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -223,7 +223,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
response = self._download_webpage(request_url, playlist_title)
response = self._fix_json(response)
if not response.strip():
- self._downloader.report_warning('Got an empty reponse, trying '
+ self._downloader.report_warning('Got an empty response, trying '
'adding the "newvideos" parameter')
response = self._download_webpage(request_url + '&newvideos=true',
playlist_title)
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
new file mode 100644
index 000000000..b62819ae5
--- /dev/null
+++ b/youtube_dl/extractor/nick.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+from ..compat import compat_urllib_parse
+
+
+class NickIE(MTVServicesInfoExtractor):
+ IE_NAME = 'nick.com'
+ _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)'
+ _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+ _TESTS = [{
+ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
+ 'playlist': [
+ {
+ 'md5': '6e5adc1e28253bbb1b28ab05403dd4d4',
+ 'info_dict': {
+ 'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30',
+ 'ext': 'mp4',
+ 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1',
+ 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+
+ }
+ },
+ {
+ 'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce',
+ 'info_dict': {
+ 'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30',
+ 'ext': 'mp4',
+ 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2',
+ 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+
+ }
+ },
+ {
+ 'md5': 'efffe1728a234b2b0d2f2b343dd1946f',
+ 'info_dict': {
+ 'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30',
+ 'ext': 'mp4',
+ 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3',
+ 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ }
+ },
+ {
+ 'md5': '1ec6690733ab9f41709e274a1d5c7556',
+ 'info_dict': {
+ 'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30',
+ 'ext': 'mp4',
+ 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4',
+ 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+ }
+ },
+ ],
+ }]
+
+ def _get_feed_query(self, uri):
+ return compat_urllib_parse.urlencode({
+ 'feed': 'nick_arc_player_prime',
+ 'mgid': uri,
+ })
+
+ def _extract_mgid(self, webpage):
+ return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
index fd107aca2..916a102bf 100644
--- a/youtube_dl/extractor/nowtv.py
+++ b/youtube_dl/extractor/nowtv.py
@@ -71,6 +71,7 @@ class NowTVBaseIE(InfoExtractor):
class NowTVIE(NowTVBaseIE):
+ _WORKING = False
_VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)'
_TESTS = [{
diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py
new file mode 100644
index 000000000..125c7010b
--- /dev/null
+++ b/youtube_dl/extractor/npr.py
@@ -0,0 +1,82 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ int_or_none,
+ qualities,
+)
+
+
+class NprIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205',
+ 'info_dict': {
+ 'id': '449974205',
+ 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More'
+ },
+ 'playlist_count': 7,
+ }, {
+ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?action=1&t=1&islist=false&id=446928052&m=446929930&live=1',
+ 'info_dict': {
+ 'id': '446928052',
+ 'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'"
+ },
+ 'playlist': [{
+ 'md5': '12fa60cb2d3ed932f53609d4aeceabf1',
+ 'info_dict': {
+ 'id': '446929930',
+ 'ext': 'mp3',
+ 'title': 'Your Mercy is Boundless (Bazum en Qo gtutyunqd)',
+ 'duration': 402,
+ },
+ }],
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ config = self._download_json(
+ 'http://api.npr.org/query?%s' % compat_urllib_parse.urlencode({
+ 'id': playlist_id,
+ 'fields': 'titles,audio,show',
+ 'format': 'json',
+ 'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010',
+ }), playlist_id)
+
+ story = config['list']['story'][0]
+
+ KNOWN_FORMATS = ('threegp', 'mp4', 'mp3')
+ quality = qualities(KNOWN_FORMATS)
+
+ entries = []
+ for audio in story.get('audio', []):
+ title = audio.get('title', {}).get('$text')
+ duration = int_or_none(audio.get('duration', {}).get('$text'))
+ formats = []
+ for format_id, formats_entry in audio.get('format', {}).items():
+ if not formats_entry:
+ continue
+ if isinstance(formats_entry, list):
+ formats_entry = formats_entry[0]
+ format_url = formats_entry.get('$text')
+ if not format_url:
+ continue
+ if format_id in KNOWN_FORMATS:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'ext': formats_entry.get('type'),
+ 'quality': quality(format_id),
+ })
+ self._sort_formats(formats)
+ entries.append({
+ 'id': audio['id'],
+ 'title': title,
+ 'duration': duration,
+ 'formats': formats,
+ })
+
+ playlist_title = story.get('title', {}).get('$text')
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py
index d2cfe0961..a83e85cb8 100644
--- a/youtube_dl/extractor/ntvde.py
+++ b/youtube_dl/extractor/ntvde.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
int_or_none,
js_to_json,
@@ -34,7 +35,7 @@ class NTVDeIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
info = self._parse_json(self._search_regex(
- r'(?s)ntv.pageInfo.article =\s(\{.*?\});', webpage, 'info'),
+ r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'),
video_id, transform_source=js_to_json)
timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp'))
vdata = self._parse_json(self._search_regex(
@@ -42,18 +43,24 @@ class NTVDeIE(InfoExtractor):
webpage, 'player data'),
video_id, transform_source=js_to_json)
duration = parse_duration(vdata.get('duration'))
- formats = [{
- 'format_id': 'flash',
- 'url': 'rtmp://fms.n-tv.de/' + vdata['video'],
- }, {
- 'format_id': 'mobile',
- 'url': 'http://video.n-tv.de' + vdata['videoMp4'],
- 'tbr': 400, # estimation
- }]
- m3u8_url = 'http://video.n-tv.de' + vdata['videoM3u8']
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, ext='mp4',
- entry_protocol='m3u8_native', preference=0))
+
+ formats = []
+ if vdata.get('video'):
+ formats.append({
+ 'format_id': 'flash',
+ 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'],
+ })
+ if vdata.get('videoMp4'):
+ formats.append({
+ 'format_id': 'mobile',
+ 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']),
+ 'tbr': 400, # estimation
+ })
+ if vdata.get('videoM3u8'):
+ m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8'])
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ preference=0, m3u8_id='hls', fatal=False))
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 3960d522e..20b984288 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -13,6 +13,9 @@ from ..compat import compat_urllib_parse
class OoyalaBaseIE(InfoExtractor):
+ _PLAYER_BASE = 'http://player.ooyala.com/'
+ _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
+ _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?'
def _extract(self, content_tree_url, video_id, domain='example.org'):
content_tree = self._download_json(content_tree_url, video_id)['content_tree']
@@ -31,24 +34,33 @@ class OoyalaBaseIE(InfoExtractor):
formats = []
for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'):
auth_data = self._download_json(
- 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?' % (pcode, embed_code) + compat_urllib_parse.urlencode({'domain': domain, 'supportedFormats': supported_format}),
+ self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
+ compat_urllib_parse.urlencode({
+ 'domain': domain,
+ 'supportedFormats': supported_format
+ }),
video_id, 'Downloading %s JSON' % supported_format)
cur_auth_data = auth_data['authorization_data'][embed_code]
if cur_auth_data['authorized']:
for stream in cur_auth_data['streams']:
- url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8')
+ url = base64.b64decode(
+ stream['url']['data'].encode('ascii')).decode('utf-8')
if url in urls:
continue
urls.append(url)
delivery_type = stream['delivery_type']
if delivery_type == 'hls' or '.m3u8' in url:
- formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ url, embed_code, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
elif delivery_type == 'hds' or '.f4m' in url:
- formats.extend(self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
elif '.smil' in url:
- formats.extend(self._extract_smil_formats(url, embed_code, fatal=False))
+ formats.extend(self._extract_smil_formats(
+ url, embed_code, fatal=False))
else:
formats.append({
'url': url,
@@ -62,7 +74,8 @@ class OoyalaBaseIE(InfoExtractor):
'fps': float_or_none(stream.get('framerate')),
})
else:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, cur_auth_data['message']), expected=True)
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, cur_auth_data['message']), expected=True)
self._sort_formats(formats)
video_info['formats'] = formats
@@ -120,7 +133,7 @@ class OoyalaIE(OoyalaBaseIE):
url, smuggled_data = unsmuggle_url(url, {})
embed_code = self._match_id(url)
domain = smuggled_data.get('domain')
- content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code)
+ content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code)
return self._extract(content_tree_url, embed_code, domain)
@@ -147,7 +160,7 @@ class OoyalaExternalIE(OoyalaBaseIE):
'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
'ext': 'mp4',
'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
- 'duration': 1302000,
+ 'duration': 1302.0,
},
'params': {
# m3u8 download
@@ -157,5 +170,5 @@ class OoyalaExternalIE(OoyalaBaseIE):
def _real_extract(self, url):
partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups()
- content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/external_id/%s/%s:%s' % (pcode, partner_id, video_id)
+ content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id)
return self._extract(content_tree_url, video_id)
diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py
new file mode 100644
index 000000000..02de1502a
--- /dev/null
+++ b/youtube_dl/extractor/ora.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ get_element_by_attribute,
+ qualities,
+ unescapeHTML,
+)
+
+
+class OraTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)'
+ _TEST = {
+ 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
+ 'md5': 'fa33717591c631ec93b04b0e330df786',
+ 'info_dict': {
+ 'id': '50178',
+ 'ext': 'mp4',
+ 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
+ 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
+ 'duration': 1477,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ video_data = self._search_regex(
+ r'"current"\s*:\s*({[^}]+?})', webpage, 'current video')
+ m3u8_url = self._search_regex(
+ r'"hls_stream"\s*:\s*"([^"]+)', video_data, 'm3u8 url', None)
+ if m3u8_url:
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ # similar to GameSpotIE
+ m3u8_path = compat_urlparse.urlparse(m3u8_url).path
+ QUALITIES_RE = r'((,[a-z]+\d+)+,?)'
+ available_qualities = self._search_regex(
+ QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',')
+ http_path = m3u8_path[1:].split('/', 1)[1]
+ http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+ http_template = http_template.replace('.csmil/master.m3u8', '')
+ http_template = compat_urlparse.urljoin(
+ 'http://videocdn-pmd.ora.tv/', http_template)
+ preference = qualities(
+ ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080'])
+ for q in available_qualities:
+ formats.append({
+ 'url': http_template % q,
+ 'format_id': q,
+ 'preference': preference(q),
+ })
+ self._sort_formats(formats)
+ else:
+ return self.url_result(self._search_regex(
+ r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube')
+
+ return {
+ 'id': self._search_regex(
+ r'"video_id"\s*:\s*(\d+)', video_data, 'video id'),
+ 'display_id': display_id,
+ 'title': unescapeHTML(self._og_search_title(webpage)),
+ 'description': get_element_by_attribute(
+ 'class', 'video_txt_decription', webpage),
+ 'thumbnail': self._proto_relative_url(self._search_regex(
+ r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)),
+ 'duration': int(self._search_regex(
+ r'"duration"\s*:\s*(\d+)', video_data, 'duration')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 2e6c9872b..c54775d54 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -170,7 +170,21 @@ class ORFOE1IE(InfoExtractor):
class ORFFM4IE(InfoExtractor):
IE_NAME = 'orf:fm4'
IE_DESC = 'radio FM4'
- _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)'
+ _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://fm4.orf.at/player/20160110/IS/',
+ 'md5': '01e736e8f1cef7e13246e880a59ad298',
+ 'info_dict': {
+ 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
+ 'ext': 'mp3',
+ 'title': 'Im Sumpf',
+ 'description': 'md5:384c543f866c4e422a55f66a62d669cd',
+ 'duration': 7173,
+ 'timestamp': 1452456073,
+ 'upload_date': '20160110',
+ },
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py
new file mode 100644
index 000000000..8d49f5c4a
--- /dev/null
+++ b/youtube_dl/extractor/pandoratv.py
@@ -0,0 +1,78 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ parse_duration,
+ str_to_int,
+)
+
+
+class PandoraTVIE(InfoExtractor):
+ IE_NAME = 'pandora.tv'
+ IE_DESC = '판도라TV'
+ _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?'
+ _TEST = {
+ 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2',
+ 'info_dict': {
+ 'id': '53294230',
+ 'ext': 'flv',
+ 'title': '頭を撫でてくれる?',
+ 'description': '頭を撫でてくれる?',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 39,
+ 'upload_date': '20151218',
+ 'uploader': 'カワイイ動物まとめ',
+ 'uploader_id': 'mikakim',
+ 'view_count': int,
+ 'like_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ video_id = qs.get('prgid', [None])[0]
+ user_id = qs.get('ch_userid', [None])[0]
+ if any(not f for f in (video_id, user_id,)):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ data = self._download_json(
+ 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s'
+ % (user_id, video_id), video_id)
+
+ info = data['data']['rows']['vod_play_info']['result']
+
+ formats = []
+ for format_id, format_url in info.items():
+ if not format_url:
+ continue
+ height = self._search_regex(
+ r'^v(\d+)[Uu]rl$', format_id, 'height', default=None)
+ if not height:
+ continue
+ formats.append({
+ 'format_id': '%sp' % height,
+ 'url': format_url,
+ 'height': int(height),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['subject'],
+ 'description': info.get('body'),
+ 'thumbnail': info.get('thumbnail') or info.get('poster'),
+ 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')),
+ 'upload_date': info['fid'][:8] if isinstance(info.get('fid'), compat_str) else None,
+ 'uploader': info.get('nickname'),
+ 'uploader_id': info.get('upload_userid'),
+ 'view_count': str_to_int(info.get('hit')),
+ 'like_count': str_to_int(info.get('likecnt')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index 55c11b3bf..12e1c2862 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -232,7 +232,7 @@ class PluralsightIE(PluralsightBaseIE):
# { a = author, cn = clip_id, lc = end, m = name }
return {
- 'id': clip['clipName'],
+ 'id': clip.get('clipName') or clip['name'],
'title': '%s - %s' % (module['title'], clip['title']),
'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')),
'creator': author,
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index baa54a3af..670e6950f 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -20,7 +20,7 @@ from ..utils import (
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
@@ -32,7 +32,7 @@ class ProSiebenSat1IE(InfoExtractor):
'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
'info_dict': {
'id': '2104602',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Episode 18 - Staffel 2',
'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
'upload_date': '20131231',
@@ -138,14 +138,13 @@ class ProSiebenSat1IE(InfoExtractor):
'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
'info_dict': {
'id': '2572814',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Andreas Kümmert: Rocket Man',
'description': 'md5:6ddb02b0781c6adf778afea606652e38',
'upload_date': '20131017',
'duration': 469.88,
},
'params': {
- # rtmp download
'skip_download': True,
},
},
@@ -153,13 +152,12 @@ class ProSiebenSat1IE(InfoExtractor):
'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
'info_dict': {
'id': '2156342',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Kurztrips zum Valentinstag',
- 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
+ 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.',
'duration': 307.24,
},
'params': {
- # rtmp download
'skip_download': True,
},
},
@@ -172,12 +170,26 @@ class ProSiebenSat1IE(InfoExtractor):
},
'playlist_count': 2,
},
+ {
+ 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge',
+ 'info_dict': {
+ 'id': '4187506',
+ 'ext': 'flv',
+ 'title': 'Best of Circus HalliGalli',
+ 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9',
+ 'upload_date': '20151229',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
_CLIPID_REGEXES = [
r'"clip_id"\s*:\s+"(\d+)"',
r'clipid: "(\d+)"',
r'clip[iI]d=(\d+)',
+ r'clip[iI]d\s*=\s*["\'](\d+)',
r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
]
_TITLE_REGEXES = [
@@ -186,12 +198,16 @@ class ProSiebenSat1IE(InfoExtractor):
r'<!-- start video -->\s*<h1>(.+?)</h1>',
r'<h1 class="att-name">\s*(.+?)</h1>',
r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
+ r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
+ r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
r'<p class="att-description">\s*(.+?)\s*</p>',
+ r'<p class="video-description" itemprop="description">\s*(.+?)</p>',
+ r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>',
]
_UPLOAD_DATE_REGEXES = [
r'<meta property="og:published_time" content="(.+?)">',
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 1ba3bbddf..45a3c41c5 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -11,6 +11,7 @@ from ..utils import (
strip_jsonp,
unescapeHTML,
clean_html,
+ ExtractorError,
)
@@ -177,7 +178,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
'info_dict': {
'id': '001BLpXF2DyJe2',
'title': '林俊杰',
- 'description': 'md5:2a222d89ba4455a3af19940c0481bb78',
+ 'description': 'md5:870ec08f7d8547c29c93010899103751',
},
'playlist_count': 12,
}
@@ -272,7 +273,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
'url': 'http://y.qq.com/#type=toplist&p=top_3',
'info_dict': {
'id': 'top_3',
- 'title': 'QQ音乐巅峰榜·欧美',
+ 'title': '巅峰榜·欧美',
'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
'计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
'歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
@@ -315,7 +316,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
IE_DESC = 'QQ音乐 - 歌单'
_VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://y.qq.com/#type=taoge&id=3462654915',
'info_dict': {
'id': '3462654915',
@@ -323,7 +324,16 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
},
'playlist_count': 40,
- }
+ 'skip': 'playlist gone',
+ }, {
+ 'url': 'http://y.qq.com/#type=taoge&id=1374105607',
+ 'info_dict': {
+ 'id': '1374105607',
+ 'title': '易入人心的华语民谣',
+ 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。',
+ },
+ 'playlist_count': 20,
+ }]
def _real_extract(self, url):
list_id = self._match_id(url)
@@ -331,14 +341,21 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
list_json = self._download_json(
'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
% list_id, list_id, 'Download list page',
- transform_source=strip_jsonp)['cdlist'][0]
-
+ transform_source=strip_jsonp)
+ if not len(list_json.get('cdlist', [])):
+ if list_json.get('code'):
+ raise ExtractorError(
+ 'QQ Music said: error %d in fetching playlist info' % list_json['code'],
+ expected=True)
+ raise ExtractorError('Unable to get playlist info')
+
+ cdlist = list_json['cdlist'][0]
entries = [
self.url_result(
'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
- ) for song in list_json['songlist']
+ ) for song in cdlist['songlist']
]
- list_name = list_json.get('dissname')
- list_description = clean_html(unescapeHTML(list_json.get('desc')))
+ list_name = cdlist.get('dissname')
+ list_description = clean_html(unescapeHTML(cdlist.get('desc')))
return self.playlist_result(entries, list_id, list_name, list_description)
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index f2679591b..a4dc5c335 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -107,7 +107,8 @@ class RaiTVIE(InfoExtractor):
return xml.replace(' tag elementi', '').replace('>/', '</')
relinker = self._download_xml(
- media['mediaUri'] + '&output=43', video_id, transform_source=fix_xml)
+ media['mediaUri'] + '&output=43',
+ video_id, transform_source=fix_xml)
has_subtitle = False
@@ -117,8 +118,8 @@ class RaiTVIE(InfoExtractor):
content_type = xpath_text(element, 'content-type')
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls',
- fatal=False))
+ media_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
diff --git a/youtube_dl/extractor/regiotv.py b/youtube_dl/extractor/regiotv.py
new file mode 100644
index 000000000..e250a52f0
--- /dev/null
+++ b/youtube_dl/extractor/regiotv.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ sanitized_Request,
+ xpath_text,
+ xpath_with_ns,
+)
+
+
+class RegioTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.regio-tv.de/video/395808.html',
+ 'info_dict': {
+ 'id': '395808',
+ 'ext': 'mp4',
+ 'title': 'Wir in Ludwigsburg',
+ 'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!',
+ }
+ }, {
+ 'url': 'http://www.regio-tv.de/video/395808',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ key = self._search_regex(
+ r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key')
+ title = self._og_search_title(webpage)
+
+ SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'
+
+ request = sanitized_Request(
+ 'http://v.telvi.de/',
+ SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
+ video_data = self._download_xml(request, video_id, 'Downloading video XML')
+
+ NS_MAP = {
+ 'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+ 'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
+ }
+
+ video_url = xpath_text(
+ video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True)
+ thumbnail = xpath_text(
+ video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail')
+ description = self._og_search_description(
+ webpage) or self._html_search_meta('description', webpage)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
new file mode 100644
index 000000000..b1b8800b9
--- /dev/null
+++ b/youtube_dl/extractor/revision3.py
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ unescapeHTML,
+ qualities,
+)
+
+
+class Revision3IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
+ _TESTS = [{
+ 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
+ 'md5': 'd94a72d85d0a829766de4deb8daaf7df',
+ 'info_dict': {
+ 'id': '73034',
+ 'display_id': 'technobuffalo/5-google-predictions-for-2016',
+ 'ext': 'webm',
+ 'title': '5 Google Predictions for 2016',
+ 'description': 'Google had a great 2015, but it\'s already time to look ahead. Here are our five predictions for 2016.',
+ 'upload_date': '20151228',
+ 'timestamp': 1451325600,
+ 'duration': 187,
+ 'uploader': 'TechnoBuffalo',
+ 'uploader_id': 'technobuffalo',
+ }
+ }, {
+ 'url': 'http://testtube.com/brainstuff',
+ 'info_dict': {
+ 'id': '251',
+ 'title': 'BrainStuff',
+ 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.',
+ },
+ 'playlist_mincount': 93,
+ }, {
+ 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
+ 'info_dict': {
+ 'id': '60163',
+ 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
+ 'duration': 275,
+ 'ext': 'webm',
+ 'title': '5 Weird Ways Plants Can Eat Animals',
+ 'description': 'Why have some plants evolved to eat meat?',
+ 'upload_date': '20150120',
+ 'timestamp': 1421763300,
+ 'uploader': 'DNews',
+ 'uploader_id': 'dnews',
+ },
+ }]
+ _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
+ _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
+
+ def _real_extract(self, url):
+ domain, display_id = re.match(self._VALID_URL, url).groups()
+ page_info = self._download_json(
+ self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
+
+ if page_info['data']['type'] == 'episode':
+ episode_data = page_info['data']
+ video_id = compat_str(episode_data['video']['data']['id'])
+ video_data = self._download_json(
+ 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
+ video_id)['items'][0]
+
+ formats = []
+ for vcodec, media in video_data['media'].items():
+ for quality_id, quality in media.items():
+ if quality_id == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ quality['url'], video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': quality['url'],
+ 'format_id': '%s-%s' % (vcodec, quality_id),
+ 'tbr': int_or_none(quality.get('bitrate')),
+ 'vcodec': vcodec,
+ })
+ self._sort_formats(formats)
+
+ preference = qualities(['mini', 'small', 'medium', 'large'])
+ thumbnails = [{
+ 'url': image_url,
+ 'id': image_id,
+ 'preference': preference(image_id)
+ } for image_id, image_url in video_data.get('images', {}).items()]
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': unescapeHTML(video_data['title']),
+ 'description': unescapeHTML(video_data.get('summary')),
+ 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '),
+ 'author': episode_data.get('author'),
+ 'uploader': video_data.get('show', {}).get('name'),
+ 'uploader_id': video_data.get('show', {}).get('slug'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
+ else:
+ show_data = page_info['show']['data']
+ episodes_data = page_info['episodes']['data']
+ num_episodes = page_info['meta']['totalEpisodes']
+ processed_episodes = 0
+ entries = []
+ page_num = 1
+ while True:
+ entries.extend([self.url_result(
+ 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data])
+ processed_episodes += len(episodes_data)
+ if processed_episodes == num_episodes:
+ break
+ page_num += 1
+ episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % (
+ domain, display_id + '/' + compat_str(page_num), domain),
+ display_id)['episodes']['data']
+
+ return self.playlist_result(
+ entries, compat_str(show_data['id']),
+ show_data.get('name'), show_data.get('summary'))
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index d9cfbf180..47c8331fe 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -2,19 +2,22 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-
from ..utils import (
float_or_none,
+ parse_iso8601,
+ unescapeHTML,
)
class RteIE(InfoExtractor):
+ IE_NAME = 'rte'
+ IE_DESC = 'Raidió Teilifís Éireann TV'
_VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
'info_dict': {
'id': '10478715',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Watch iWitness online',
'thumbnail': 're:^https?://.*\.jpg$',
'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.',
@@ -44,13 +47,6 @@ class RteIE(InfoExtractor):
# f4m_url = server + relative_url
f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url']
f4m_formats = self._extract_f4m_formats(f4m_url, video_id)
- f4m_formats = [{
- 'format_id': f['format_id'],
- 'url': f['url'],
- 'ext': 'mp4',
- 'width': f['width'],
- 'height': f['height'],
- } for f in f4m_formats]
return {
'id': video_id,
@@ -60,3 +56,73 @@ class RteIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
}
+
+
+class RteRadioIE(InfoExtractor):
+ IE_NAME = 'rte:radio'
+ IE_DESC = 'Raidió Teilifís Éireann radio'
+ # Radioplayer URLs have the specifier #!rii=<channel_id>:<id>:<playable_item_id>:<date>:
+ # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated.
+ # An <id> uniquely defines an individual recording, and is the only part we require.
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:[0-9]*)(?:%3A|:)(?P<id>[0-9]+)'
+
+ _TEST = {
+ 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:',
+ 'info_dict': {
+ 'id': '10507902',
+ 'ext': 'mp4',
+ 'title': 'Gloria',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0',
+ 'timestamp': 1451203200,
+ 'upload_date': '20151227',
+ 'duration': 7230.0,
+ },
+ 'params': {
+ 'skip_download': 'f4m fails with --test atm'
+ }
+ }
+
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+
+ json_string = self._download_json(
+ 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id,
+ item_id)
+
+ # NB the string values in the JSON are stored using XML escaping(!)
+ show = json_string['shows'][0]
+ title = unescapeHTML(show['title'])
+ description = unescapeHTML(show.get('description'))
+ thumbnail = show.get('thumbnail')
+ duration = float_or_none(show.get('duration'), 1000)
+ timestamp = parse_iso8601(show.get('published'))
+
+ mg = show['media:group'][0]
+
+ formats = []
+
+ if mg.get('url') and not mg['url'].startswith('rtmpe:'):
+ formats.append({'url': mg['url']})
+
+ if mg.get('hls_server') and mg.get('hls_url'):
+ formats.extend(self._extract_m3u8_formats(
+ mg['hls_server'] + mg['hls_url'], item_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+ if mg.get('hds_server') and mg.get('hds_url'):
+ formats.extend(self._extract_f4m_formats(
+ mg['hds_server'] + mg['hds_url'], item_id,
+ f4m_id='hds', fatal=False))
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': item_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index f063ab5dd..3cc32847b 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -197,7 +197,7 @@ class RTSIE(SRGSSRIE):
media_id, f4m_id=format_id, fatal=False))
elif format_url.endswith('.m3u8'):
formats.extend(self._extract_m3u8_formats(
- format_url, media_id, 'mp4', m3u8_id=format_id, fatal=False))
+ format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
else:
formats.append({
'format_id': format_id,
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
index 41fddc375..ffea438cc 100644
--- a/youtube_dl/extractor/ruutu.py
+++ b/youtube_dl/extractor/ruutu.py
@@ -75,9 +75,12 @@ class RuutuIE(InfoExtractor):
preference = -1 if proto == 'rtmp' else 1
label = child.get('label')
tbr = int_or_none(child.get('bitrate'))
+ format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto
+ if not self._is_valid_url(video_url, video_id, format_id):
+ continue
width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
formats.append({
- 'format_id': '%s-%s' % (proto, label if label else tbr),
+ 'format_id': format_id,
'url': video_url,
'width': width,
'height': height,
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
index f76fb12c0..1178b7a27 100644
--- a/youtube_dl/extractor/shahid.py
+++ b/youtube_dl/extractor/shahid.py
@@ -73,6 +73,9 @@ class ShahidIE(InfoExtractor):
'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
% (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+ if player.get('drm'):
+ raise ExtractorError('This video is DRM protected.', expected=True)
+
formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
video = self._download_json(
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 02e64e094..b2d5487ca 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -384,27 +384,24 @@ class SoundcloudUserIE(SoundcloudIE):
resource = mobj.group('rsrc') or 'all'
base_url = self._BASE_URL_MAP[resource] % user['id']
- next_href = None
+ COMMON_QUERY = {
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ }
+
+ query = COMMON_QUERY.copy()
+ query['offset'] = 0
+
+ next_href = base_url + '?' + compat_urllib_parse.urlencode(query)
entries = []
for i in itertools.count():
- if not next_href:
- data = compat_urllib_parse.urlencode({
- 'offset': i * 50,
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- 'linked_partitioning': '1',
- 'representation': 'speedy',
- })
- next_href = base_url + '?' + data
-
response = self._download_json(
next_href, uploader, 'Downloading track page %s' % (i + 1))
collection = response['collection']
-
if not collection:
- self.to_screen('%s: End page received' % uploader)
break
def resolve_permalink_url(candidates):
@@ -419,12 +416,15 @@ class SoundcloudUserIE(SoundcloudIE):
if permalink_url:
entries.append(self.url_result(permalink_url))
- if 'next_href' in response:
- next_href = response['next_href']
- if not next_href:
- break
- else:
- next_href = None
+ next_href = response.get('next_href')
+ if not next_href:
+ break
+
+ parsed_next_href = compat_urlparse.urlparse(response['next_href'])
+ qs = compat_urlparse.parse_qs(parsed_next_href.query)
+ qs.update(COMMON_QUERY)
+ next_href = compat_urlparse.urlunparse(
+ parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True)))
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py
index 2c8b4276b..4707029ca 100644
--- a/youtube_dl/extractor/srgssr.py
+++ b/youtube_dl/extractor/srgssr.py
@@ -29,7 +29,8 @@ class SRGSSRIE(InfoExtractor):
media_id)[media_type.capitalize()]
if media_data.get('block') and media_data['block'] in self._ERRORS:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, self._ERRORS[media_data['block']]), expected=True)
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, self._ERRORS[media_data['block']]), expected=True)
return media_data
@@ -56,25 +57,26 @@ class SRGSSRIE(InfoExtractor):
formats = []
for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
protocol = source.get('@protocol')
- if protocol in ('HTTP-HDS', 'HTTP-HLS'):
- assets = {}
- for quality in source['url']:
- assets[quality['@quality']] = quality['text']
- asset_url = assets.get('HD') or assets.get('HQ') or assets.get('SD') or assets.get('MQ') or assets.get('LQ')
- if '.f4m' in asset_url:
- formats.extend(self._extract_f4m_formats(asset_url + '?hdcore=3.4.0', media_id, f4m_id='hds', fatal=False))
- elif '.m3u8' in asset_url:
- formats.extend(self._extract_m3u8_formats(asset_url, media_id, m3u8_id='hls', fatal=False))
- else:
- for asset in source['url']:
- asset_url = asset['text']
+ for asset in source['url']:
+ asset_url = asset['text']
+ quality = asset['@quality']
+ format_id = '%s-%s' % (protocol, quality)
+ if protocol == 'HTTP-HDS':
+ formats.extend(self._extract_f4m_formats(
+ asset_url + '?hdcore=3.4.0', media_id,
+ f4m_id=format_id, fatal=False))
+ elif protocol == 'HTTP-HLS':
+ formats.extend(self._extract_m3u8_formats(
+ asset_url, media_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False))
+ else:
ext = None
- if asset_url.startswith('rtmp'):
+ if protocol == 'RTMP':
ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext')
formats.append({
- 'format_id': asset['@quality'],
+ 'format_id': format_id,
'url': asset_url,
- 'preference': preference(asset['@quality']),
+ 'preference': preference(quality),
'ext': ext,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py
index fe11c20df..4e860db0a 100644
--- a/youtube_dl/extractor/tele13.py
+++ b/youtube_dl/extractor/tele13.py
@@ -46,8 +46,12 @@ class Tele13IE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code')
- sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json)
+ setup_js = self._search_regex(
+ r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)",
+ webpage, 'setup code')
+ sources = self._parse_json(self._search_regex(
+ r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'),
+ display_id, js_to_json)
preference = qualities(['Móvil', 'SD', 'HD'])
formats = []
@@ -57,7 +61,9 @@ class Tele13IE(InfoExtractor):
if format_url and format_url not in urls:
ext = determine_ext(format_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
elif YoutubeIE.suitable(format_url):
return self.url_result(format_url, 'Youtube')
else:
@@ -72,8 +78,11 @@ class Tele13IE(InfoExtractor):
return {
'id': display_id,
- 'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'),
- 'description': self._html_search_meta('description', webpage, 'description'),
- 'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None),
+ 'title': self._search_regex(
+ r'title\s*:\s*"([^"]+)"', setup_js, 'title'),
+ 'description': self._html_search_meta(
+ 'description', webpage, 'description'),
+ 'thumbnail': self._search_regex(
+ r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None),
'formats': formats,
}
diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py
deleted file mode 100644
index 26655d690..000000000
--- a/youtube_dl/extractor/testtube.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- qualities,
-)
-
-
-class TestTubeIE(InfoExtractor):
- _VALID_URL = r'https?://testtube\.com/[^/?#]+/(?P<id>[^/?#]+)'
- _TESTS = [{
- 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
- 'info_dict': {
- 'id': '60163',
- 'display_id': '5-weird-ways-plants-can-eat-animals',
- 'duration': 275,
- 'ext': 'webm',
- 'title': '5 Weird Ways Plants Can Eat Animals',
- 'description': 'Why have some plants evolved to eat meat?',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'uploader': 'DNews',
- 'uploader_id': 'dnews',
- },
- }, {
- 'url': 'https://testtube.com/iflscience/insane-jet-ski-flipping',
- 'info_dict': {
- 'id': 'fAGfJ4YjVus',
- 'ext': 'mp4',
- 'title': 'Flipping Jet-Ski Skills | Outrageous Acts of Science',
- 'uploader': 'Science Channel',
- 'uploader_id': 'ScienceChannel',
- 'upload_date': '20150203',
- 'description': 'md5:e61374030015bae1d2e22f096d4769d6',
- }
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- youtube_url = self._html_search_regex(
- r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
- webpage, 'youtube iframe', default=None)
- if youtube_url:
- return self.url_result(youtube_url, 'Youtube', video_id=display_id)
-
- video_id = self._search_regex(
- r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
- webpage, 'video ID')
-
- all_info = self._download_json(
- 'https://testtube.com/api/getPlaylist.json?api_key=ba9c741bce1b9d8e3defcc22193f3651b8867e62&codecs=h264,vp8,theora&video_id=%s' % video_id,
- video_id)
- info = all_info['items'][0]
-
- formats = []
- for vcodec, fdatas in info['media'].items():
- for name, fdata in fdatas.items():
- formats.append({
- 'format_id': '%s-%s' % (vcodec, name),
- 'url': fdata['url'],
- 'vcodec': vcodec,
- 'tbr': fdata.get('bitrate'),
- })
- self._sort_formats(formats)
-
- duration = int_or_none(info.get('duration'))
- images = info.get('images')
- thumbnails = None
- preference = qualities(['mini', 'small', 'medium', 'large'])
- if images:
- thumbnails = [{
- 'id': thumbnail_id,
- 'url': img_url,
- 'preference': preference(thumbnail_id)
- } for thumbnail_id, img_url in images.items()]
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': info['title'],
- 'description': info.get('summary'),
- 'thumbnails': thumbnails,
- 'uploader': info.get('show', {}).get('name'),
- 'uploader_id': info.get('show', {}).get('slug'),
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py
index c7d559315..46918adb0 100644
--- a/youtube_dl/extractor/testurl.py
+++ b/youtube_dl/extractor/testurl.py
@@ -7,7 +7,7 @@ from ..utils import ExtractorError
class TestURLIE(InfoExtractor):
- """ Allows adressing of the test cases as test:yout.*be_1 """
+ """ Allows addressing of the test cases as test:yout.*be_1 """
IE_DESC = False # Do not list
_VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 0bf6726b5..10f2cad55 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -85,7 +85,7 @@ class ThePlatformBaseIE(InfoExtractor):
class ThePlatformIE(ThePlatformBaseIE):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
- (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+ (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
_TESTS = [{
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index d6d038a8d..adc05ed5f 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -4,32 +4,9 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE
-from .discovery import DiscoveryIE
from ..compat import compat_urlparse
-class TlcIE(DiscoveryIE):
- IE_NAME = 'tlc.com'
- _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
-
- # DiscoveryIE has _TESTS
- _TESTS = [{
- 'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
- 'info_dict': {
- 'id': '104493',
- 'ext': 'mp4',
- 'title': 'Too Big to Fly',
- 'description': 'Buddy has taken on a high flying task.',
- 'duration': 119,
- 'timestamp': 1393365060,
- 'upload_date': '20140225',
- },
- 'params': {
- 'skip_download': True, # requires ffmpef
- },
- }]
-
-
class TlcDeIE(InfoExtractor):
IE_NAME = 'tlc.de'
_VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)'
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 5f7ac4b35..da3cd76f7 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -4,10 +4,16 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ unescapeHTML,
+)
class TudouIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
+ IE_NAME = 'tudou'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
_TESTS = [{
'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
'md5': '140a49ed444bd22f93330985d8475fcb',
@@ -16,6 +22,11 @@ class TudouIE(InfoExtractor):
'ext': 'f4v',
'title': '卡马乔国足开大脚长传冲吊集锦',
'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1372113489000,
+ 'description': '卡马乔卡家军,开大脚先进战术不完全集锦!',
+ 'duration': 289.04,
+ 'view_count': int,
+ 'filesize': int,
}
}, {
'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
@@ -24,10 +35,12 @@ class TudouIE(InfoExtractor):
'ext': 'f4v',
'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1349207518000,
+ 'description': 'md5:294612423894260f2dcd5c6c04fe248b',
+ 'duration': 5478.33,
+ 'view_count': int,
+ 'filesize': int,
}
- }, {
- 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
- 'only_matching': True,
}]
_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
@@ -42,24 +55,20 @@ class TudouIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ item_data = self._download_json(
+ 'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)
- youku_vcode = self._search_regex(
- r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
+ youku_vcode = item_data.get('vcode')
if youku_vcode:
return self.url_result('youku:' + youku_vcode, ie='Youku')
- title = self._search_regex(
- r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
- thumbnail_url = self._search_regex(
- r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)
-
- player_url = self._search_regex(
- r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
- webpage, 'player URL', default=self._PLAYER_URL)
+ title = unescapeHTML(item_data['kw'])
+ description = item_data.get('desc')
+ thumbnail_url = item_data.get('pic')
+ view_count = int_or_none(item_data.get('playTimes'))
+ timestamp = int_or_none(item_data.get('pt'))
- segments = self._parse_json(self._search_regex(
- r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
+ segments = self._parse_json(item_data['itemSegs'], video_id)
# It looks like the keys are the arguments that have to be passed as
# the hd field in the request url, we pick the higher
# Also, filter non-number qualities (see issue #3643).
@@ -80,8 +89,13 @@ class TudouIE(InfoExtractor):
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
+ 'description': description,
+ 'view_count': view_count,
+ 'timestamp': timestamp,
+ 'duration': float_or_none(part.get('seconds'), 1000),
+ 'filesize': int_or_none(part.get('size')),
'http_headers': {
- 'Referer': player_url,
+ 'Referer': self._PLAYER_URL,
},
}
result.append(part_info)
@@ -92,3 +106,47 @@ class TudouIE(InfoExtractor):
'id': video_id,
'title': title,
}
+
+
+class TudouPlaylistIE(InfoExtractor):
+ IE_NAME = 'tudou:playlist'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
+ _TESTS = [{
+ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
+ 'info_dict': {
+ 'id': 'zzdE77v6Mmo',
+ },
+ 'playlist_mincount': 209,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ playlist_data = self._download_json(
+ 'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)
+ entries = [self.url_result(
+ 'http://www.tudou.com/programs/view/%s' % item['icode'],
+ 'Tudou', item['icode'],
+ item['kw']) for item in playlist_data['items']]
+ return self.playlist_result(entries, playlist_id)
+
+
+class TudouAlbumIE(InfoExtractor):
+ IE_NAME = 'tudou:album'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
+ _TESTS = [{
+ 'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
+ 'info_dict': {
+ 'id': 'v5qckFJvNJg',
+ },
+ 'playlist_mincount': 45,
+ }]
+
+ def _real_extract(self, url):
+ album_id = self._match_id(url)
+ album_data = self._download_json(
+ 'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)
+ entries = [self.url_result(
+ 'http://www.tudou.com/programs/view/%s' % item['icode'],
+ 'Tudou', item['icode'],
+ item['kw']) for item in album_data['items']]
+ return self.playlist_result(entries, album_id)
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
index 1c4b6d635..343edf206 100644
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@@ -67,7 +67,7 @@ class TV4IE(InfoExtractor):
info = self._download_json(
'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON')
- # If is_geo_restricted is true, it doesn't neceserally mean we can't download it
+ # If is_geo_restricted is true, it doesn't necessarily mean we can't download it
if info['is_geo_restricted']:
self.report_warning('This content might not be available in your country due to licensing restrictions.')
if info['requires_subscription']:
diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py
new file mode 100644
index 000000000..b73279dec
--- /dev/null
+++ b/youtube_dl/extractor/tvland.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+
+
+class TVLandIE(MTVServicesInfoExtractor):
+ IE_NAME = 'tvland.com'
+ _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)'
+ _FEED_URL = 'http://www.tvland.com/feeds/mrss/'
+ _TESTS = [{
+ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048',
+ 'playlist': [
+ {
+ 'md5': '227e9723b9669c05bf51098b10287aa7',
+ 'info_dict': {
+ 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd',
+ 'ext': 'mp4',
+ 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5',
+ }
+ },
+ {
+ 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b',
+ 'info_dict': {
+ 'id': 'f4279548-6e13-40dd-92e8-860d27289197',
+ 'ext': 'mp4',
+ 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5',
+ }
+ },
+ {
+ 'md5': 'fde4c3bccd7cc7e3576b338734153cec',
+ 'info_dict': {
+ 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334',
+ 'ext': 'mp4',
+ 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5',
+ }
+ },
+ {
+ 'md5': '247f6780cda6891f2e49b8ae2b10e017',
+ 'info_dict': {
+ 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960',
+ 'ext': 'mp4',
+ 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5',
+ }
+ },
+ {
+ 'md5': 'fd269f33256e47bad5eb6c40de089ff6',
+ 'info_dict': {
+ 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7',
+ 'ext': 'mp4',
+ 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5',
+ }
+ }
+ ],
+ }, {
+ 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies',
+ 'md5': 'e2c6389401cf485df26c79c247b08713',
+ 'info_dict': {
+ 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88',
+ 'ext': 'mp4',
+ 'title': 'Younger|Younger: Hilary Duff - Little Lies',
+ 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269'
+ },
+ }]
diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py
new file mode 100644
index 000000000..ca7d953b8
--- /dev/null
+++ b/youtube_dl/extractor/twentymin.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class TwentyMinutenIE(InfoExtractor):
+ IE_NAME = '20min'
+ _VALID_URL = r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P<id>\d+)|(?:[^/]+/)*(?P<display_id>[^/#?]+))'
+ _TESTS = [{
+ # regular video
+ 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2',
+ 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c',
+ 'info_dict': {
+ 'id': '469148',
+ 'ext': 'flv',
+ 'title': '85 000 Franken für 15 perfekte Minuten',
+ 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)',
+ 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg'
+ }
+ }, {
+ # news article with video
+ 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469',
+ 'md5': 'cd4cbb99b94130cff423e967cd275e5e',
+ 'info_dict': {
+ 'id': '469408',
+ 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469',
+ 'ext': 'flv',
+ 'title': '«Wir müssen mutig nach vorne schauen»',
+ 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.',
+ 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg'
+ }
+ }, {
+ 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._html_search_regex(
+ r'<h1>.*?<span>(.+?)</span></h1>',
+ webpage, 'title', default=None)
+ if not title:
+ title = remove_end(re.sub(
+ r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News')
+
+ if not video_id:
+ video_id = self._search_regex(
+ r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id')
+
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 59832b1ec..f5b5e7fd6 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -11,6 +11,7 @@ from ..utils import (
float_or_none,
int_or_none,
sanitized_Request,
+ unescapeHTML,
)
@@ -19,8 +20,6 @@ class UdemyIE(InfoExtractor):
_VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
_LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
_ORIGIN_URL = 'https://www.udemy.com'
- _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
- _ALREADY_ENROLLED = '>You are already taking this course.<'
_NETRC_MACHINE = 'udemy'
_TESTS = [{
@@ -37,15 +36,21 @@ class UdemyIE(InfoExtractor):
}]
def _enroll_course(self, webpage, course_id):
- enroll_url = self._search_regex(
+ checkout_url = unescapeHTML(self._search_regex(
+ r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1',
+ webpage, 'checkout url', group='url', default=None))
+ if checkout_url:
+ raise ExtractorError(
+ 'Course %s is not free. You have to pay for it before you can download. '
+ 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True)
+
+ enroll_url = unescapeHTML(self._search_regex(
r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1',
- webpage, 'enroll url', group='url',
- default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id)
- webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course')
- if self._SUCCESSFULLY_ENROLLED in webpage:
- self.to_screen('%s: Successfully enrolled in' % course_id)
- elif self._ALREADY_ENROLLED in webpage:
- self.to_screen('%s: Already enrolled in' % course_id)
+ webpage, 'enroll url', group='url', default=None))
+ if enroll_url:
+ webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course')
+ if '>You have enrolled in' in webpage:
+ self.to_screen('%s: Successfully enrolled in the course' % course_id)
def _download_lecture(self, course_id, lecture_id):
return self._download_json(
@@ -147,7 +152,7 @@ class UdemyIE(InfoExtractor):
# Error could possibly mean we are not enrolled in the course
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
self._enroll_course(webpage, course_id)
- lecture_id = self._download_lecture(course_id, lecture_id)
+ lecture = self._download_lecture(course_id, lecture_id)
else:
raise
@@ -244,10 +249,25 @@ class UdemyCourseIE(UdemyIE):
'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
course_id, 'Downloading course curriculum')
- entries = [
- self.url_result(
- 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy')
- for asset in response if asset.get('assetType') or asset.get('asset_type') == 'Video'
- ]
+ entries = []
+ chapter, chapter_number = None, None
+ for asset in response:
+ asset_type = asset.get('assetType') or asset.get('asset_type')
+ if asset_type == 'Video':
+ asset_id = asset.get('id')
+ if asset_id:
+ entry = {
+ '_type': 'url_transparent',
+ 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']),
+ 'ie_key': UdemyIE.ie_key(),
+ }
+ if chapter_number:
+ entry['chapter_number'] = chapter_number
+ if chapter:
+ entry['chapter'] = chapter
+ entries.append(entry)
+ elif asset.get('type') == 'chapter':
+ chapter_number = asset.get('index') or asset.get('object_index')
+ chapter = asset.get('title')
return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
index c4751050e..60328123c 100644
--- a/youtube_dl/extractor/ultimedia.py
+++ b/youtube_dl/extractor/ultimedia.py
@@ -4,17 +4,30 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlparse
-from ..utils import (
- ExtractorError,
- qualities,
- unified_strdate,
- clean_html,
-)
+from ..utils import int_or_none
class UltimediaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?ultimedia\.com/
+ (?:
+ deliver/
+ (?P<embed_type>
+ generic|
+ musique
+ )
+ (?:/[^/]+)*/
+ (?:
+ src|
+ article
+ )|
+ default/index/video
+ (?P<site_type>
+ generic|
+ music
+ )
+ /id
+ )/(?P<id>[\d+a-z]+)'''
_TESTS = [{
# news
'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
@@ -23,9 +36,11 @@ class UltimediaIE(InfoExtractor):
'id': 's8uk0r',
'ext': 'mp4',
'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
- 'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 74,
'upload_date': '20150317',
+ 'timestamp': 1426604939,
+ 'uploader_id': '3fszv',
},
}, {
# music
@@ -34,72 +49,61 @@ class UltimediaIE(InfoExtractor):
'info_dict': {
'id': 'xvpfp8',
'ext': 'mp4',
- 'title': "Two - C'est la vie (Clip)",
- 'description': 'Two',
+ 'title': 'Two - C\'est La Vie (clip)',
'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 233,
'upload_date': '20150224',
+ 'timestamp': 1424760500,
+ 'uploader_id': '3rfzk',
},
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)',
+ webpage)
+ if mobj:
+ return mobj.group('url')
- deliver_url = self._proto_relative_url(self._search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
- webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ video_type = mobj.group('embed_type') or mobj.group('site_type')
+ if video_type == 'music':
+ video_type = 'musique'
- deliver_page = self._download_webpage(
- deliver_url, video_id, 'Downloading iframe page')
+ deliver_info = self._download_json(
+ 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type),
+ video_id)
- if '>This video is currently not available' in deliver_page:
- raise ExtractorError(
- 'Video %s is currently not available' % video_id, expected=True)
+ yt_id = deliver_info.get('yt_id')
+ if yt_id:
+ return self.url_result(yt_id, 'Youtube')
- player = self._parse_json(
- self._search_regex(
- r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
- deliver_page, 'player'),
- video_id)
+ jwconf = deliver_info['jwconf']
- quality = qualities(['flash', 'html5'])
formats = []
- for mode in player['modes']:
- video_url = mode.get('config', {}).get('file')
- if not video_url:
- continue
- if re.match(r'https?://www\.youtube\.com/.+?', video_url):
- return self.url_result(video_url, 'Youtube')
+ for source in jwconf['playlist'][0]['sources']:
formats.append({
- 'url': video_url,
- 'format_id': mode.get('type'),
- 'quality': quality(mode.get('type')),
+ 'url': source['file'],
+ 'format_id': source.get('label'),
})
- self._sort_formats(formats)
-
- thumbnail = player.get('image')
- title = clean_html((
- self._html_search_regex(
- r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
- webpage, 'title', default=None) or
- self._search_regex(
- r"var\s+nameVideo\s*=\s*'([^']+)'",
- deliver_page, 'title')))
-
- description = clean_html(self._html_search_regex(
- r'(?s)<span>Description</span>(.+?)</p>', webpage,
- 'description', fatal=False))
+ self._sort_formats(formats)
- upload_date = unified_strdate(self._search_regex(
- r'Ajouté le\s*<span>([^<]+)', webpage,
- 'upload date', fatal=False))
+ title = deliver_info['title']
+ thumbnail = jwconf.get('image')
+ duration = int_or_none(deliver_info.get('duration'))
+ timestamp = int_or_none(deliver_info.get('release_time'))
+ uploader_id = deliver_info.get('owner_id')
return {
'id': video_id,
'title': title,
- 'description': description,
'thumbnail': thumbnail,
- 'upload_date': upload_date,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader_id': uploader_id,
'formats': formats,
}
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
index f70978299..594bee4f9 100644
--- a/youtube_dl/extractor/unistra.py
+++ b/youtube_dl/extractor/unistra.py
@@ -38,7 +38,7 @@ class UnistraIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- files = set(re.findall(r'file\s*:\s*"([^"]+)"', webpage))
+ files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage))
quality = qualities(['SD', 'HD'])
formats = []
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 73b05ecab..b5fe753d7 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -47,7 +47,7 @@ class UstreamIE(InfoExtractor):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
- # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990)
+ # some sites use this embed format (see: https://github.com/rg3/youtube-dl/issues/2990)
if m.group('type') == 'embed/recorded':
video_id = m.group('id')
desktop_url = 'http://www.ustream.tv/recorded/' + video_id
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 87aca327b..5e2e7cbac 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -8,6 +8,7 @@ from ..utils import sanitized_Request
class VideoMegaIE(InfoExtractor):
+ _WORKING = False
_VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py
new file mode 100644
index 000000000..fcee940e6
--- /dev/null
+++ b/youtube_dl/extractor/videomore.py
@@ -0,0 +1,243 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
+ xpath_text,
+)
+
+
+class VideomoreIE(InfoExtractor):
+ IE_NAME = 'videomore'
+ _VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)'
+ _TESTS = [{
+ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617',
+ 'md5': '70875fbf57a1cd004709920381587185',
+ 'info_dict': {
+ 'id': '367617',
+ 'ext': 'flv',
+ 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук',
+ 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.',
+ 'series': 'Кино в деталях',
+ 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук',
+ 'episode_number': None,
+ 'season': 'Сезон 2015',
+ 'season_number': 5,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 2910,
+ 'age_limit': 16,
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'http://videomore.ru/embed/259974',
+ 'info_dict': {
+ 'id': '259974',
+ 'ext': 'flv',
+ 'title': '80 серия',
+ 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. Парни узнают подробности прошлого Макеева.',
+ 'series': 'Молодежка',
+ 'episode': '80 серия',
+ 'episode_number': 40,
+ 'season': '2 сезон',
+ 'season_number': 2,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 2809,
+ 'age_limit': 16,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://videomore.ru/molodezhka/sezon_promo/341073',
+ 'info_dict': {
+ 'id': '341073',
+ 'ext': 'flv',
+ 'title': 'Команда проиграла из-за Бакина?',
+ 'description': 'Молодежка 3 сезон скоро',
+ 'series': 'Молодежка',
+ 'episode': 'Команда проиграла из-за Бакина?',
+ 'episode_number': None,
+ 'season': 'Промо',
+ 'season_number': 99,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 29,
+ 'age_limit': 16,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://videomore.ru/elki_3?track_id=364623',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomore.ru/embed/364623',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomore.ru/video/tracks/364623.xml',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomore.ru/video/tracks/364623.json',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomore.ru/video/tracks/158031/quotes/33248',
+ 'only_matching': True,
+ }, {
+ 'url': 'videomore:367617',
+ 'only_matching': True,
+ }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        """Find a videomore.ru flash player embedded through an <object> tag.
+
+        Returns the track XML URL passed to the player as its ``config``
+        parameter, or None when webpage contains no such embed.
+        """
+        # Every dot of the host name is escaped so that it only matches a
+        # literal '.', for the player URL as well as for the config URL.
+        mobj = re.search(
+            r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        """Extract a single videomore.ru track.
+
+        Formats are read from the track XML (an f4m manifest URL), all other
+        metadata from the track JSON. Returns the info dict of the video.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        # 'sid' is set for internal videomore:<id> URLs, 'id' for web URLs
+        video_id = mobj.group('sid') or mobj.group('id')
+
+        video = self._download_xml(
+            'http://videomore.ru/video/tracks/%s.xml' % video_id,
+            video_id, 'Downloading video XML')
+
+        video_url = xpath_text(video, './/video_url', 'video url', fatal=True)
+        formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds')
+
+        data = self._download_json(
+            'http://videomore.ru/video/tracks/%s.json' % video_id,
+            video_id, 'Downloading video JSON')
+
+        title = data.get('title') or data['project_title']
+        description = data.get('description') or data.get('description_raw')
+        timestamp = parse_iso8601(data.get('published_at'))
+        duration = int_or_none(data.get('duration'))
+        view_count = int_or_none(data.get('views'))
+        age_limit = parse_age_limit(data.get('min_age'))
+        # "or []" also covers a key that is present but null
+        thumbnails = [{
+            'url': thumbnail,
+        } for thumbnail in data.get('big_thumbnail_urls') or []]
+
+        series = data.get('project_title')
+        episode = data.get('title')
+        # "or None" turns falsy values (e.g. 0 or '') into a missing number
+        episode_number = int_or_none(data.get('episode_of_season') or None)
+        season = data.get('season_title')
+        season_number = int_or_none(data.get('season_pos') or None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'series': series,
+            'episode': episode,
+            'episode_number': episode_number,
+            'season': season,
+            'season_number': season_number,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class VideomoreVideoIE(InfoExtractor):
+ IE_NAME = 'videomore:video'
+ _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)[/?#&]*$'
+ _TESTS = [{
+ # single video with og:video:iframe
+ 'url': 'http://videomore.ru/elki_3',
+ 'info_dict': {
+ 'id': '364623',
+ 'ext': 'flv',
+ 'title': 'Ёлки 3',
+ 'description': '',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 5579,
+ 'age_limit': 6,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # season single series with og:video:iframe
+ 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk',
+ 'only_matching': True,
+ }, {
+ # single video without og:video:iframe
+ 'url': 'http://videomore.ru/marin_i_ego_druzya',
+ 'info_dict': {
+ 'id': '359073',
+ 'ext': 'flv',
+ 'title': '1 серия. Здравствуй, Аквавилль!',
+ 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 754,
+ 'age_limit': 6,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+    @classmethod
+    def suitable(cls, url):
+        """Reject every URL that VideomoreIE already accepts."""
+        if VideomoreIE.suitable(url):
+            return False
+        return super(VideomoreVideoIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        """Resolve a videomore.ru page to the track it embeds.
+
+        The og:video:iframe URL is preferred; without it the numeric track id
+        is searched for in the page and handed over as a videomore:<id> URL.
+        """
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        iframe_url = self._og_search_property(
+            'video:iframe', webpage, 'video url', default=None)
+        if iframe_url:
+            return self.url_result(iframe_url, VideomoreIE.ie_key())
+
+        # Patterns are tried in order, the first one that matches wins
+        track_id_patterns = (
+            r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml',
+            r'track-id=["\'](\d+)',
+            r'xcnt_product_id\s*=\s*(\d+)',
+        )
+        video_id = self._search_regex(track_id_patterns, webpage, 'video id')
+
+        return self.url_result('videomore:%s' % video_id, VideomoreIE.ie_key())
+
+
+class VideomoreSeasonIE(InfoExtractor):
+ IE_NAME = 'videomore:season'
+ _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)[/?#&]*$'
+ _TESTS = [{
+ 'url': 'http://videomore.ru/molodezhka/sezon_promo',
+ 'info_dict': {
+ 'id': 'molodezhka/sezon_promo',
+ 'title': 'Молодежка Промо',
+ },
+ 'playlist_mincount': 12,
+ }]
+
+    def _real_extract(self, url):
+        """Build a playlist from the episode links of a season page.
+
+        Every link below the season path (<project>/<season>) that carries
+        the "widget-item-desc" class becomes one playlist entry.
+        """
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        # display_id comes straight from the URL and may contain regex
+        # metacharacters, so it is escaped before being put into the pattern
+        entries = [
+            self.url_result(item) for item in re.findall(
+                r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
+                % re.escape(display_id), webpage)]
+
+        return self.playlist_result(entries, display_id, title)
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index 591024ead..2cd36508a 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -11,6 +11,7 @@ from ..utils import (
class VideoTtIE(InfoExtractor):
+ _WORKING = False
ID_NAME = 'video.tt'
IE_DESC = 'video.tt - Your True Tube'
_VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 7af699982..76e681bc3 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -11,6 +11,7 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
+ determine_ext,
encode_dict,
ExtractorError,
InAdvancePagedList,
@@ -208,6 +209,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
'only_matching': True,
},
+ {
+ # source file returns 403: Forbidden
+ 'url': 'https://vimeo.com/7809605',
+ 'only_matching': True,
+ },
]
@staticmethod
@@ -217,7 +223,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
if mobj:
player_url = unescapeHTML(mobj.group('url'))
- surl = smuggle_url(player_url, {'Referer': url})
+ surl = smuggle_url(player_url, {'http_headers': {'Referer': url}})
return surl
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
@@ -262,11 +268,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- url, data = unsmuggle_url(url)
+ url, data = unsmuggle_url(url, {})
headers = std_headers
- if data is not None:
+ if 'http_headers' in data:
headers = headers.copy()
- headers.update(data)
+ headers.update(data['http_headers'])
if 'Referer' not in headers:
headers['Referer'] = url
@@ -342,7 +348,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
- if data and '_video_password_verified' in data:
+ if '_video_password_verified' in data:
raise ExtractorError('video password verification failed!')
self._verify_video_password(url, video_id, webpage)
return self._real_extract(
@@ -354,6 +360,13 @@ class VimeoIE(VimeoBaseInfoExtractor):
if config.get('view') == 4:
config = self._verify_player_video_password(url, video_id)
+ if '>You rented this title.<' in webpage:
+ feature_id = config.get('video', {}).get('vod', {}).get('feature_id')
+ if feature_id and not data.get('force_feature_id', False):
+ return self.url_result(smuggle_url(
+ 'https://player.vimeo.com/player/%s' % feature_id,
+ {'force_feature_id': True}), 'Vimeo')
+
# Extract title
video_title = config["video"]["title"]
@@ -412,16 +425,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
download_data = self._download_json(download_request, video_id, fatal=False)
if download_data:
source_file = download_data.get('source_file')
- if source_file and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
- formats.append({
- 'url': source_file['download_url'],
- 'ext': source_file['extension'].lower(),
- 'width': int_or_none(source_file.get('width')),
- 'height': int_or_none(source_file.get('height')),
- 'filesize': parse_filesize(source_file.get('size')),
- 'format_id': source_file.get('public_name', 'Original'),
- 'preference': 1,
- })
+            # Offer the original source file as an extra format, but only if
+            # it is ready for download and its URL actually responds
+            if isinstance(source_file, dict):
+                download_url = source_file.get('download_url')
+                if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
+                    source_name = source_file.get('public_name', 'Original')
+                    if self._is_valid_url(download_url, video_id, '%s video' % source_name):
+                        # ext must be a plain string: no trailing comma (it
+                        # would build a 1-tuple), and a null 'extension'
+                        # falls back to the extension of the URL
+                        ext = (source_file.get('extension') or determine_ext(download_url)).lower()
+                        formats.append({
+                            'url': download_url,
+                            'ext': ext,
+                            'width': int_or_none(source_file.get('width')),
+                            'height': int_or_none(source_file.get('height')),
+                            'filesize': parse_filesize(source_file.get('size')),
+                            'format_id': source_name,
+                            'preference': 1,
+                        })
config_files = config['video'].get('files') or config['request'].get('files', {})
for f in config_files.get('progressive', []):
video_url = f.get('url')
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index 357594a11..a97995a6d 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -5,12 +5,13 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
+ NO_DEFAULT,
sanitized_Request,
)
class VodlockerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
+ _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
_TESTS = [{
'url': 'http://vodlocker.com/e8wvyzz4sl42',
@@ -43,16 +44,31 @@ class VodlockerIE(InfoExtractor):
webpage = self._download_webpage(
req, video_id, 'Downloading video page')
+        def extract_file_url(html, default=NO_DEFAULT):
+            # Look up the quoted http URL of the "file:" entry in html;
+            # default is passed through to _search_regex unchanged
+            file_url_re = r'file:\s*"(http[^\"]+)",'
+            return self._search_regex(
+                file_url_re, html, 'file url', default=default)
+
+ video_url = extract_file_url(webpage, default=None)
+
+ if not video_url:
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1',
+ webpage, 'embed url', group='url')
+ embed_webpage = self._download_webpage(
+ embed_url, video_id, 'Downloading embed webpage')
+ video_url = extract_file_url(embed_webpage)
+ thumbnail_webpage = embed_webpage
+ else:
+ thumbnail_webpage = webpage
+
title = self._search_regex(
r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title')
thumbnail = self._search_regex(
- r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail')
- url = self._search_regex(
- r'file:\s*"(http[^\"]+)",', webpage, 'file url')
+ r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False)
formats = [{
'format_id': 'sd',
- 'url': url,
+ 'url': video_url,
}]
return {
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index bbd3bbf7b..01891ac4c 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -8,7 +8,7 @@ from ..utils import float_or_none
class VRTIE(InfoExtractor):
- _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
+ _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
_TESTS = [
# deredactie.be
{
@@ -52,6 +52,10 @@ class VRTIE(InfoExtractor):
'duration': 661,
}
},
+ {
+ 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
@@ -69,11 +73,11 @@ class VRTIE(InfoExtractor):
if mobj:
formats.extend(self._extract_m3u8_formats(
'%s/%s' % (mobj.group('server'), mobj.group('path')),
- video_id, 'mp4'))
+ video_id, 'mp4', m3u8_id='hls'))
mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
if mobj:
formats.extend(self._extract_f4m_formats(
- '%s/manifest.f4m' % mobj.group('src'), video_id))
+ '%s/manifest.f4m' % mobj.group('src'), video_id, f4m_id='hds'))
self._sort_formats(formats)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index e8511398c..a851578e0 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -108,7 +108,9 @@ class WDRIE(InfoExtractor):
if mobj.group('player') is None:
entries = [
self.url_result(page_url + href, 'WDR')
- for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
+ for href in re.findall(
+ r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX,
+ webpage)
]
if entries: # Playlist page
@@ -133,8 +135,8 @@ class WDRIE(InfoExtractor):
note='Downloading playlist page %d' % page_num)
return self.playlist_result(entries, page_id)
- flashvars = compat_parse_qs(
- self._html_search_regex(r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+ flashvars = compat_parse_qs(self._html_search_regex(
+ r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
page_id = flashvars['trackerClipId'][0]
video_url = flashvars['dslSrc'][0]
@@ -148,7 +150,8 @@ class WDRIE(InfoExtractor):
if 'trackerClipAirTime' in flashvars:
upload_date = flashvars['trackerClipAirTime'][0]
else:
- upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
+ upload_date = self._html_search_meta(
+ 'DC.Date', webpage, 'upload date')
if upload_date:
upload_date = unified_strdate(upload_date)
@@ -157,12 +160,15 @@ class WDRIE(InfoExtractor):
preference = qualities(['S', 'M', 'L', 'XL'])
if video_url.endswith('.f4m'):
- formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id,
+ f4m_id='hds', fatal=False))
elif video_url.endswith('.smil'):
- formats.extend(self._extract_smil_formats(video_url, page_id, False, {
- 'hdcore': '3.3.0',
- 'plugin': 'aasp-3.3.0.99.43',
- }))
+ formats.extend(self._extract_smil_formats(
+ video_url, page_id, False, {
+ 'hdcore': '3.3.0',
+ 'plugin': 'aasp-3.3.0.99.43',
+ }))
else:
formats.append({
'url': video_url,
@@ -171,11 +177,16 @@ class WDRIE(InfoExtractor):
},
})
- m3u8_url = self._search_regex(r'rel="adaptiv"[^>]+href="([^"]+)"', webpage, 'm3u8 url', default=None)
+ m3u8_url = self._search_regex(
+ r'rel="adaptiv"[^>]+href="([^"]+)"',
+ webpage, 'm3u8 url', default=None)
if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, page_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
- direct_urls = re.findall(r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
+ direct_urls = re.findall(
+ r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
if direct_urls:
for quality, video_url in direct_urls:
formats.append({
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 8938c0e45..fd43e8854 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,10 +4,9 @@ import re
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
- str_to_int,
+ float_or_none,
int_or_none,
- parse_duration,
+ unified_strdate,
)
@@ -22,7 +21,7 @@ class XHamsterIE(InfoExtractor):
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
'uploader': 'Ruseful2011',
- 'duration': 893,
+ 'duration': 893.52,
'age_limit': 18,
}
},
@@ -34,7 +33,7 @@ class XHamsterIE(InfoExtractor):
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
'uploader': 'jojo747400',
- 'duration': 200,
+ 'duration': 200.48,
'age_limit': 18,
}
},
@@ -64,20 +63,21 @@ class XHamsterIE(InfoExtractor):
webpage = self._download_webpage(mrss_url, video_id)
title = self._html_search_regex(
- [r'<title>(?P<title>.+?)(?:, (?:[^,]+? )?Porn: xHamster| - xHamster\.com)</title>',
- r'<h1>([^<]+)</h1>'], webpage, 'title')
+ [r'<h1[^>]*>([^<]+)</h1>',
+ r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"',
+ r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
+ webpage, 'title')
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
description = mobj.group(1) if mobj else None
- upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
- webpage, 'upload date', fatal=False)
- if upload_date:
- upload_date = unified_strdate(upload_date)
+ upload_date = unified_strdate(self._search_regex(
+ r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
+ webpage, 'upload date', fatal=False))
uploader = self._html_search_regex(
- r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)",
+ r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>',
webpage, 'uploader', default='anonymous')
thumbnail = self._search_regex(
@@ -85,12 +85,13 @@ class XHamsterIE(InfoExtractor):
r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
webpage, 'thumbnail', fatal=False, group='thumbnail')
- duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
- webpage, 'duration', fatal=False))
+ duration = float_or_none(self._search_regex(
+ r'(["\'])duration\1\s*:\s*(["\'])(?P<duration>.+?)\2',
+ webpage, 'duration', fatal=False, group='duration'))
- view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
- if view_count:
- view_count = str_to_int(view_count)
+ view_count = int_or_none(self._search_regex(
+ r'content=["\']User(?:View|Play)s:(\d+)',
+ webpage, 'view count', fatal=False))
mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
(like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index a1fe24050..8cd3a0687 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -1,10 +1,12 @@
from __future__ import unicode_literals
+import itertools
import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
+ int_or_none,
parse_duration,
sanitized_Request,
str_to_int,
@@ -12,7 +14,7 @@ from ..utils import (
class XTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))'
+ _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/watch\.php\?.*\bv=)(?P<id>[^/?&#]+)'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
@@ -30,7 +32,7 @@ class XTubeIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = sanitized_Request(url)
+ req = sanitized_Request('http://www.xtube.com/watch.php?v=%s' % video_id)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
@@ -88,45 +90,43 @@ class XTubeIE(InfoExtractor):
class XTubeUserIE(InfoExtractor):
IE_DESC = 'XTube user profile'
- _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
+ _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)'
_TEST = {
- 'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
+ 'url': 'http://www.xtube.com/profile/greenshowers-4056496',
'info_dict': {
- 'id': 'greenshowers',
+ 'id': 'greenshowers-4056496',
'age_limit': 18,
},
'playlist_mincount': 155,
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- username = mobj.group('username')
-
- profile_page = self._download_webpage(
- url, username, note='Retrieving profile page')
-
- video_count = int(self._search_regex(
- r'<strong>%s\'s Videos \(([0-9]+)\)</strong>' % username, profile_page,
- 'video count'))
-
- PAGE_SIZE = 25
- urls = []
- page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
- for n in range(1, page_count + 1):
- lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
- lpage = self._download_webpage(
- lpage_url, username,
- note='Downloading page %d/%d' % (n, page_count))
- urls.extend(
- re.findall(r'addthis:url="([^"]+)"', lpage))
-
- return {
- '_type': 'playlist',
- 'id': username,
- 'age_limit': 18,
- 'entries': [{
- '_type': 'url',
- 'url': eurl,
- 'ie_key': 'XTube',
- } for eurl in urls]
- }
+ user_id = self._match_id(url)
+
+ entries = []
+ for pagenum in itertools.count(1):
+ request = sanitized_Request(
+ 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum),
+ headers={
+ 'Cookie': 'popunder=4',
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Referer': url,
+ })
+
+ page = self._download_json(
+ request, user_id, 'Downloading videos JSON page %d' % pagenum)
+
+ html = page.get('html')
+ if not html:
+ break
+
+ for _, video_id in re.findall(r'data-plid=(["\'])(.+?)\1', html):
+ entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key()))
+
+ page_count = int_or_none(page.get('pageCount'))
+ if not page_count or pagenum == page_count:
+ break
+
+ playlist = self.playlist_result(entries, user_id)
+ playlist['age_limit'] = 18
+ return playlist
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index fca5ddc69..4a492f784 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -155,7 +155,16 @@ class YahooIE(InfoExtractor):
'description': 'md5:8fc39608213295748e1e289807838c97',
'duration': 1646,
},
- }
+ }, {
+ # it uses an alias to get the video_id
+ 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html',
+ 'info_dict': {
+ 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737',
+ 'ext': 'mp4',
+ 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking',
+ 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.',
+ },
+ },
]
def _real_extract(self, url):
@@ -199,13 +208,22 @@ class YahooIE(InfoExtractor):
r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
default=None)
if items_json is None:
- CONTENT_ID_REGEXES = [
- r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
- r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
- r'"first_videoid"\s*:\s*"([^"]+)"',
- r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
- ]
- video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
+ alias = self._search_regex(
+ r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None)
+ if alias is not None:
+ alias_info = self._download_json(
+ 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias,
+ display_id, 'Downloading alias info')
+ video_id = alias_info[0]['id']
+ else:
+ CONTENT_ID_REGEXES = [
+ r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
+ r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
+ r'"first_videoid"\s*:\s*"([^"]+)"',
+ r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
+ ]
+ video_id = self._search_regex(
+ CONTENT_ID_REGEXES, webpage, 'content ID')
else:
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 3a3432be8..f767fa15f 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -2,6 +2,9 @@
from __future__ import unicode_literals
import base64
+import random
+import string
+import time
from .common import InfoExtractor
from ..compat import (
@@ -141,6 +144,11 @@ class YoukuIE(InfoExtractor):
return video_urls_dict
+    @staticmethod
+    def get_ysuid():
+        """Return the current Unix time in whole seconds followed by three
+        random ASCII letters."""
+        timestamp = int(time.time())
+        suffix = ''.join(
+            random.choice(string.ascii_letters) for _ in range(3))
+        return '%d%s' % (timestamp, suffix)
+
def get_hd(self, fm):
hd_id_dict = {
'3gp': '0',
@@ -189,6 +197,8 @@ class YoukuIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ self._set_cookie('youku.com', '__ysuid', self.get_ysuid())
+
def retrieve_data(req_url, note):
headers = {
'Referer': req_url,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 4aac2cc03..d31161d21 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -613,7 +613,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'params': {
'skip_download': 'requires avconv',
- }
+ },
+ 'skip': 'This live event has ended.',
},
# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
{
@@ -706,6 +707,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
{
# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
+ # Also tests cut-off URL expansion in video description (see
+ # https://github.com/rg3/youtube-dl/issues/1892,
+ # https://github.com/rg3/youtube-dl/issues/8164)
'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
'info_dict': {
'id': 'lsguqyKfVQg',
@@ -1235,10 +1239,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_description = re.sub(r'''(?x)
<a\s+
(?:[a-zA-Z-]+="[^"]+"\s+)*?
- title="([^"]+)"\s+
+ (?:title|href)="([^"]+)"\s+
(?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="yt-uix-redirect-link"\s*>
- [^<]+
+ class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
+ [^<]+\.{3}\s*
</a>
''', r'\1', video_description)
video_description = clean_html(video_description)
@@ -1487,7 +1491,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if codecs:
codecs = codecs.split(',')
if len(codecs) == 2:
- acodec, vcodec = codecs[0], codecs[1]
+ acodec, vcodec = codecs[1], codecs[0]
else:
acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
dct.update({
@@ -1505,6 +1509,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for a_format in formats:
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
else:
+ unavailable_message = self._html_search_regex(
+ r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
+ video_webpage, 'unavailable message', default=None)
+ if unavailable_message:
+ raise ExtractorError(unavailable_message, expected=True)
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 2a1f2f6d1..c619a75e2 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -13,6 +13,7 @@ from ..utils import (
determine_ext,
qualities,
float_or_none,
+ ExtractorError,
)
@@ -59,7 +60,6 @@ class ZDFIE(InfoExtractor):
'ext': 'flv',
'format_id': '%s-%d' % (proto, bitrate),
'tbr': bitrate,
- 'protocol': proto,
})
self._sort_formats(formats)
return formats
@@ -70,6 +70,15 @@ class ZDFIE(InfoExtractor):
note='Downloading video info',
errnote='Failed to download video info')
+ status_code = doc.find('./status/statuscode')
+ if status_code is not None and status_code.text != 'ok':
+ code = status_code.text
+ if code == 'notVisibleAnymore':
+ message = 'Video %s is not available' % video_id
+ else:
+ message = '%s returned error: %s' % (self.IE_NAME, code)
+ raise ExtractorError(message, expected=True)
+
title = doc.find('.//information/title').text
description = xpath_text(doc, './/information/detail', 'description')
duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration'))
@@ -129,10 +138,10 @@ class ZDFIE(InfoExtractor):
video_url, video_id, fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
- video_url, video_id, f4m_id='hds', fatal=False))
+ video_url, video_id, f4m_id=format_id, fatal=False))
else:
proto = format_m.group('proto').lower()
diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py
new file mode 100644
index 000000000..de819376d
--- /dev/null
+++ b/youtube_dl/extractor/zippcast.py
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ str_to_int,
+)
+
+
+class ZippCastIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
+ # m3u8, hq direct link
+ 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81',
+ 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6',
+ 'info_dict': {
+ 'id': 'c9cfd5c7e44dbc29c81',
+ 'ext': 'mp4',
+ 'title': '[Vinesauce] Vinny - Digital Space Traveler',
+ 'description': 'Muted on youtube, but now uploaded in it\'s original form.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'vinesauce',
+ 'view_count': int,
+ 'categories': ['Entertainment'],
+ 'tags': list,
+ },
+ }, {
+ # f4m, lq ipod direct link
+ 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.zippcast.com/video/%s' % video_id, video_id)
+
+ formats = []
+ video_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage,
+ 'video url', default=None, group='url')
+ if video_url:
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'http',
+ 'preference': 0, # direct link is almost always of worse quality
+ })
+ src_url = self._search_regex(
+ r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1',
+ webpage, 'src', default=None, group='url')
+ ext = determine_ext(src_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src_url, video_id, f4m_id='hds', fatal=False))
+ self._sort_formats(formats)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage)
+ uploader = self._search_regex(
+ r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>',
+ webpage, 'uploader', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+ view_count = str_to_int(self._search_regex(
+ r'>([\d,.]+) views!', webpage, 'view count', fatal=False))
+
+ categories = re.findall(
+ r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<',
+ webpage)
+ tags = re.findall(
+ r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<',
+ webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'categories': categories,
+ 'tags': tags,
+ 'formats': formats,
+ }