aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py25
-rw-r--r--youtube_dl/extractor/adobetv.py7
-rw-r--r--youtube_dl/extractor/adultswim.py2
-rw-r--r--youtube_dl/extractor/appletrailers.py12
-rw-r--r--youtube_dl/extractor/bandcamp.py20
-rw-r--r--youtube_dl/extractor/blinkx.py49
-rw-r--r--youtube_dl/extractor/brightcove.py3
-rw-r--r--youtube_dl/extractor/buzzfeed.py5
-rw-r--r--youtube_dl/extractor/cbs.py5
-rw-r--r--youtube_dl/extractor/cbssports.py30
-rw-r--r--youtube_dl/extractor/chirbit.py84
-rw-r--r--youtube_dl/extractor/common.py34
-rw-r--r--youtube_dl/extractor/dailymotion.py1
-rw-r--r--youtube_dl/extractor/defense.py5
-rw-r--r--youtube_dl/extractor/embedly.py16
-rw-r--r--youtube_dl/extractor/escapist.py11
-rw-r--r--youtube_dl/extractor/fivemin.py1
-rw-r--r--youtube_dl/extractor/gdcvault.py11
-rw-r--r--youtube_dl/extractor/generic.py37
-rw-r--r--youtube_dl/extractor/ign.py3
-rw-r--r--youtube_dl/extractor/imgur.py97
-rw-r--r--youtube_dl/extractor/livestream.py4
-rw-r--r--youtube_dl/extractor/nationalgeographic.py38
-rw-r--r--youtube_dl/extractor/nbc.py8
-rw-r--r--youtube_dl/extractor/netzkino.py3
-rw-r--r--youtube_dl/extractor/patreon.py39
-rw-r--r--youtube_dl/extractor/pornhub.py32
-rw-r--r--youtube_dl/extractor/r7.py88
-rw-r--r--youtube_dl/extractor/radiode.py15
-rw-r--r--youtube_dl/extractor/rtlnl.py63
-rw-r--r--youtube_dl/extractor/rtve.py9
-rw-r--r--youtube_dl/extractor/sandia.py117
-rw-r--r--youtube_dl/extractor/sockshare.py5
-rw-r--r--youtube_dl/extractor/soundgasm.py24
-rw-r--r--youtube_dl/extractor/teamcoco.py49
-rw-r--r--youtube_dl/extractor/ted.py27
-rw-r--r--youtube_dl/extractor/theonion.py17
-rw-r--r--youtube_dl/extractor/theplatform.py4
-rw-r--r--youtube_dl/extractor/tv4.py100
-rw-r--r--youtube_dl/extractor/twitch.py7
-rw-r--r--youtube_dl/extractor/videolecturesnet.py34
-rw-r--r--youtube_dl/extractor/vimeo.py19
-rw-r--r--youtube_dl/extractor/vk.py3
-rw-r--r--youtube_dl/extractor/webofstories.py22
-rw-r--r--youtube_dl/extractor/wsj.py4
-rw-r--r--youtube_dl/extractor/xtube.py2
-rw-r--r--youtube_dl/extractor/yahoo.py1
-rw-r--r--youtube_dl/extractor/yam.py81
-rw-r--r--youtube_dl/extractor/youtube.py16
-rw-r--r--youtube_dl/extractor/zapiks.py110
50 files changed, 1205 insertions, 194 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 13292073c..40fc92cf7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -58,10 +58,15 @@ from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .cbsnews import CBSNewsIE
+from .cbssports import CBSSportsIE
from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
+from .chirbit import (
+ ChirbitIE,
+ ChirbitProfileIE,
+)
from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
@@ -121,6 +126,7 @@ from .ellentv import (
EllenTVClipsIE,
)
from .elpais import ElPaisIE
+from .embedly import EmbedlyIE
from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
@@ -204,6 +210,7 @@ from .imdb import (
ImdbIE,
ImdbListIE
)
+from .imgur import ImgurIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
@@ -282,6 +289,7 @@ from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .myvidster import MyVidsterIE
+from .nationalgeographic import NationalGeographicIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@@ -350,13 +358,17 @@ from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
-from .pornhub import PornHubIE
+from .pornhub import (
+ PornHubIE,
+ PornHubPlaylistIE,
+)
from .pornotube import PornotubeIE
from .pornoxo import PornoXOIE
from .promptfile import PromptFileIE
from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .quickvid import QuickVidIE
+from .r7 import R7IE
from .radiode import RadioDeIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
@@ -371,7 +383,7 @@ from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rte import RteIE
-from .rtlnl import RtlXlIE
+from .rtlnl import RtlNlIE
from .rtlnow import RTLnowIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
@@ -386,6 +398,7 @@ from .rutube import (
RutubePersonIE,
)
from .rutv import RUTVIE
+from .sandia import SandiaIE
from .sapo import SapoIE
from .savefrom import SaveFromIE
from .sbs import SBSIE
@@ -416,7 +429,10 @@ from .soundcloud import (
SoundcloudUserIE,
SoundcloudPlaylistIE
)
-from .soundgasm import SoundgasmIE
+from .soundgasm import (
+ SoundgasmIE,
+ SoundgasmProfileIE
+)
from .southpark import (
SouthParkIE,
SouthparkDeIE,
@@ -482,6 +498,7 @@ from .tumblr import TumblrIE
from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
+from .tv4 import TV4IE
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
@@ -579,6 +596,7 @@ from .yahoo import (
YahooIE,
YahooSearchIE,
)
+from .yam import YamIE
from .yesjapan import YesJapanIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
@@ -602,6 +620,7 @@ from .youtube import (
YoutubeUserIE,
YoutubeWatchLaterIE,
)
+from .zapiks import ZapiksIE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import (
ZingMp3SongIE,
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 28e07f8b0..97d128560 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -28,7 +28,6 @@ class AdobeTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
webpage = self._download_webpage(url, video_id)
player = self._parse_json(
@@ -44,8 +43,10 @@ class AdobeTVIE(InfoExtractor):
self._html_search_meta('datepublished', webpage, 'upload date'))
duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration')
- or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration'))
+ self._html_search_meta('duration', webpage, 'duration') or
+ self._search_regex(
+ r'Runtime:\s*(\d{2}:\d{2}:\d{2})',
+ webpage, 'duration', fatal=False))
view_count = str_to_int(self._search_regex(
r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>',
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 502a9c25a..34b8b0115 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -38,6 +38,7 @@ class AdultSwimIE(InfoExtractor):
},
],
'info_dict': {
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow',
'title': 'Rick and Morty - Pilot',
'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
}
@@ -55,6 +56,7 @@ class AdultSwimIE(InfoExtractor):
}
],
'info_dict': {
+ 'id': '-t8CamQlQ2aYZ49ItZCFog',
'title': 'American Dad - Putting Francine Out of Business',
'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
},
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 287f71e07..576f03b5b 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -11,9 +11,12 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _TESTS = [{
"url": "http://trailers.apple.com/trailers/wb/manofsteel/",
+ 'info_dict': {
+ 'id': 'manofsteel',
+ },
"playlist": [
{
"md5": "d97a8e575432dbcb81b7c3acb741f8a8",
@@ -60,7 +63,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
]
- }
+ }, {
+ 'url': 'http://trailers.apple.com/ca/metropole/autrui/',
+ 'only_matching': True,
+ }]
_JSON_RE = r'iTunes.playURL\((.*?)\);'
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 490cc961a..869294967 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -109,7 +109,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+)|/?(?:$|[?#]))'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -133,31 +133,37 @@ class BandcampAlbumIE(InfoExtractor):
],
'info_dict': {
'title': 'Jazz Format Mixtape vol.1',
+ 'id': 'jazz-format-mixtape-vol-1',
+ 'uploader_id': 'blazo',
},
'params': {
'playlistend': 2
},
- 'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+ 'skip': 'Bandcamp imposes download limits.'
}, {
'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
'info_dict': {
'title': 'Hierophany of the Open Grave',
+ 'uploader_id': 'nightbringer',
+ 'id': 'hierophany-of-the-open-grave',
},
'playlist_mincount': 9,
}, {
'url': 'http://dotscale.bandcamp.com',
'info_dict': {
'title': 'Loom',
+ 'id': 'dotscale',
+ 'uploader_id': 'dotscale',
},
'playlist_mincount': 7,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('subdomain')
- title = mobj.group('title')
- display_id = title or playlist_id
- webpage = self._download_webpage(url, display_id)
+ uploader_id = mobj.group('subdomain')
+ album_id = mobj.group('album_id')
+ playlist_id = album_id or uploader_id
+ webpage = self._download_webpage(url, playlist_id)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
raise ExtractorError('The page doesn\'t contain any tracks')
@@ -168,8 +174,8 @@ class BandcampAlbumIE(InfoExtractor):
r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
return {
'_type': 'playlist',
+ 'uploader_id': uploader_id,
'id': playlist_id,
- 'display_id': display_id,
'title': title,
'entries': entries,
}
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
index 3e461e715..3b8eabe8f 100644
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@@ -1,40 +1,35 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
-from ..utils import remove_start
+from ..utils import (
+ remove_start,
+ int_or_none,
+)
class BlinkxIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
+ _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
- 'url': 'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB',
- 'md5': '2e9a07364af40163a908edbf10bb2492',
+ 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
+ 'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
- 'id': '8aQUy7GV',
+ 'id': 'Da0Gw3xc',
'ext': 'mp4',
- 'title': 'Police Car Rolls Away',
- 'uploader': 'stupidvideos.com',
- 'upload_date': '20131215',
- 'timestamp': 1387068000,
- 'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!',
- 'duration': 14.886,
- 'thumbnails': [{
- 'width': 100,
- 'height': 76,
- 'resolution': '100x76',
- 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',
- }],
+ 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
+ 'uploader': 'IGN News',
+ 'upload_date': '20150217',
+ 'timestamp': 1424215740,
+ 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
+ 'duration': 47.743333,
},
}
- def _real_extract(self, rl):
- m = re.match(self._VALID_URL, rl)
- video_id = m.group('id')
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
display_id = video_id[:8]
api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' +
@@ -60,18 +55,20 @@ class BlinkxIE(InfoExtractor):
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
- tbr = (int(m['vbr']) + int(m['abr'])) // 1000
+ vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
+ abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
+ tbr = vbr + abr if vbr and abr else None
format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
'vcodec': vcodec,
'acodec': acodec,
- 'abr': int(m['abr']) // 1000,
- 'vbr': int(m['vbr']) // 1000,
+ 'abr': abr,
+ 'vbr': vbr,
'tbr': tbr,
- 'width': int(m['w']),
- 'height': int(m['h']),
+ 'width': int_or_none(m.get('w')),
+ 'height': int_or_none(m.get('h')),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index ea0969d4d..0733bece7 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -95,6 +95,7 @@ class BrightcoveIE(InfoExtractor):
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
'info_dict': {
'title': 'Sealife',
+ 'id': '3550319591001',
},
'playlist_mincount': 7,
},
@@ -247,7 +248,7 @@ class BrightcoveIE(InfoExtractor):
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
- return self.playlist_result(videos, playlist_id=playlist_info['id'],
+ return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
index a5d2af174..df503ecc0 100644
--- a/youtube_dl/extractor/buzzfeed.py
+++ b/youtube_dl/extractor/buzzfeed.py
@@ -33,6 +33,7 @@ class BuzzFeedIE(InfoExtractor):
'skip_download': True, # Got enough YouTube download tests
},
'info_dict': {
+ 'id': 'look-at-this-cute-dog-omg',
'description': 're:Munchkin the Teddy Bear is back ?!',
'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
},
@@ -42,8 +43,8 @@ class BuzzFeedIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20141124',
'uploader_id': 'CindysMunchkin',
- 'description': 're:© 2014 Munchkin the Shih Tzu',
- 'uploader': 'Munchkin the Shih Tzu',
+ 'description': 're:© 2014 Munchkin the',
+ 'uploader': 're:^Munchkin the',
'title': 're:Munchkin the Teddy Bear gets her exercise',
},
}]
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index e43756ec6..1ceb9d8d9 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -39,8 +37,7 @@ class CBSIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
real_id = self._search_regex(
r"video\.settings\.pid\s*=\s*'([^']+)';",
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
new file mode 100644
index 000000000..ae47e74cc
--- /dev/null
+++ b/youtube_dl/extractor/cbssports.py
@@ -0,0 +1,30 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class CBSSportsIE(InfoExtractor):
+ _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
+ 'info_dict': {
+ 'id': '_d5_GbO8p1sT',
+ 'ext': 'flv',
+ 'title': 'US Open flashbacks: 1990s',
+ 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ section = mobj.group('section')
+ video_id = mobj.group('id')
+ all_videos = self._download_json(
+ 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
+ video_id)
+ # The json file contains the info of all the videos in the section
+ video_info = next(v for v in all_videos if v['pcid'] == video_id)
+ return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
new file mode 100644
index 000000000..b1eeaf101
--- /dev/null
+++ b/youtube_dl/extractor/chirbit.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+)
+
+
+class ChirbitIE(InfoExtractor):
+ IE_NAME = 'chirbit'
+ _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://chirb.it/PrIPv5',
+ 'md5': '9847b0dad6ac3e074568bf2cfb197de8',
+ 'info_dict': {
+ 'id': 'PrIPv5',
+ 'ext': 'mp3',
+ 'title': 'Фасадстрой',
+ 'duration': 52,
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://chirb.it/%s' % audio_id, audio_id)
+
+ audio_url = self._search_regex(
+ r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url')
+
+ title = self._search_regex(
+ r'itemprop="name">([^<]+)', webpage, 'title')
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._search_regex(
+ r'itemprop="playCount"\s*>(\d+)', webpage,
+ 'listen count', fatal=False))
+ comment_count = int_or_none(self._search_regex(
+ r'>(\d+) Comments?:', webpage,
+ 'comment count', fatal=False))
+
+ return {
+ 'id': audio_id,
+ 'url': audio_url,
+ 'title': title,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ }
+
+
+class ChirbitProfileIE(InfoExtractor):
+ IE_NAME = 'chirbit:profile'
+ _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://chirbit.com/ScarletBeauty',
+ 'info_dict': {
+ 'id': 'ScarletBeauty',
+ 'title': 'Chirbits by ScarletBeauty',
+ },
+ 'playlist_mincount': 3,
+ }
+
+ def _real_extract(self, url):
+ profile_id = self._match_id(url)
+
+ rss = self._download_xml(
+ 'http://chirbit.com/rss/%s' % profile_id, profile_id)
+
+ entries = [
+ self.url_result(audio_url.text, 'Chirbit')
+ for audio_url in rss.findall('./channel/item/link')]
+
+ title = rss.find('./channel/title').text
+
+ return self.playlist_result(entries, profile_id, title)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 7d8ce1808..87fce9cd8 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -27,7 +27,6 @@ from ..utils import (
compiled_regex_type,
ExtractorError,
float_or_none,
- HEADRequest,
int_or_none,
RegexNotFoundError,
sanitize_filename,
@@ -398,6 +397,16 @@ class InfoExtractor(object):
if blocked_iframe:
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
+ if '<title>The URL you requested has been blocked</title>' in content[:512]:
+ msg = (
+ 'Access to this webpage has been blocked by Indian censorship. '
+ 'Use a VPN or proxy server (with --proxy) to route around it.')
+ block_msg = self._html_search_regex(
+ r'</h1><p>(.*?)</p>',
+ content, 'block message', default=None)
+ if block_msg:
+ msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ raise ExtractorError(msg, expected=True)
return content
@@ -735,6 +744,7 @@ class InfoExtractor(object):
f.get('language_preference') if f.get('language_preference') is not None else -1,
f.get('quality') if f.get('quality') is not None else -1,
f.get('tbr') if f.get('tbr') is not None else -1,
+ f.get('filesize') if f.get('filesize') is not None else -1,
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
@@ -742,7 +752,6 @@ class InfoExtractor(object):
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
f.get('fps') if f.get('fps') is not None else -1,
- f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('source_preference') if f.get('source_preference') is not None else -1,
f.get('format_id'),
@@ -759,9 +768,7 @@ class InfoExtractor(object):
def _is_valid_url(self, url, video_id, item='video'):
try:
- self._request_webpage(
- HEADRequest(url), video_id,
- 'Checking %s URL' % item)
+ self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
@@ -807,8 +814,8 @@ class InfoExtractor(object):
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
- manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
- + (media_el.attrib.get('href') or media_el.attrib.get('url')))
+ manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
+ (media_el.attrib.get('href') or media_el.attrib.get('url')))
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
@@ -832,7 +839,7 @@ class InfoExtractor(object):
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
- 'preference': -1,
+ 'preference': preference - 1 if preference else -1,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
}]
@@ -847,6 +854,7 @@ class InfoExtractor(object):
note='Downloading m3u8 information',
errnote='Failed to download m3u8 information')
last_info = None
+ last_media = None
kv_rex = re.compile(
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
for line in m3u8_doc.splitlines():
@@ -857,6 +865,13 @@ class InfoExtractor(object):
if v.startswith('"'):
v = v[1:-1]
last_info[m.group('key')] = v
+ elif line.startswith('#EXT-X-MEDIA:'):
+ last_media = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_media[m.group('key')] = v
elif line.startswith('#') or not line.strip():
continue
else:
@@ -885,6 +900,9 @@ class InfoExtractor(object):
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ if last_media is not None:
+ f['m3u8_media'] = last_media
+ last_media = None
formats.append(f)
last_info = {}
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 4ca892926..42b20a46d 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -190,6 +190,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
'info_dict': {
'title': 'SPORT',
+ 'id': 'xv4bw_nqtv_sport',
},
'playlist_mincount': 20,
}]
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
index 2b90bf4fc..98e3aedfd 100644
--- a/youtube_dl/extractor/defense.py
+++ b/youtube_dl/extractor/defense.py
@@ -25,8 +25,9 @@ class DefenseGouvFrIE(InfoExtractor):
r"flashvars.pvg_id=\"(\d+)\";",
webpage, 'ID')
- json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/'
- + video_id)
+ json_url = (
+ 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' %
+ video_id)
info = self._download_json(json_url, title, 'Downloading JSON config')
video_url = info['renditions'][0]['url']
diff --git a/youtube_dl/extractor/embedly.py b/youtube_dl/extractor/embedly.py
new file mode 100644
index 000000000..1cdb11e34
--- /dev/null
+++ b/youtube_dl/extractor/embedly.py
@@ -0,0 +1,16 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class EmbedlyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
+ _TESTS = [{
+ 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 4303feccd..b49b9869f 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -22,6 +22,7 @@ class EscapistIE(InfoExtractor):
'uploader_id': 'the-escapist-presents',
'uploader': 'The Escapist Presents',
'title': "Breaking Down Baldur's Gate",
+ 'thumbnail': 're:^https?://.*\.jpg$',
}
}
@@ -30,19 +31,18 @@ class EscapistIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
uploader_id = self._html_search_regex(
- r"<h1 class='headline'><a href='/videos/view/(.*?)'",
+ r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'",
webpage, 'uploader ID', fatal=False)
uploader = self._html_search_regex(
- r"<h1 class='headline'>(.*?)</a>",
+ r"<h1\s+class='headline'>(.*?)</a>",
webpage, 'uploader', fatal=False)
description = self._html_search_meta('description', webpage)
raw_title = self._html_search_meta('title', webpage, fatal=True)
title = raw_title.partition(' : ')[2]
- player_url = self._og_search_video_url(webpage, name='player URL')
- config_url = compat_urllib_parse.unquote(self._search_regex(
- r'config=(.*)$', player_url, 'config URL'))
+ config_url = compat_urllib_parse.unquote(self._html_search_regex(
+ r'<param\s+name="flashvars"\s+value="config=([^"&]+)', webpage, 'config URL'))
formats = []
@@ -81,5 +81,4 @@ class EscapistIE(InfoExtractor):
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': description,
- 'player_url': player_url,
}
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 5b24b921c..157094e8c 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -14,6 +14,7 @@ class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
(?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
+ https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
5min:)
(?P<id>\d+)
'''
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index fed968f51..f7b467b0a 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -7,6 +7,7 @@ from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import remove_end
class GDCVaultIE(InfoExtractor):
@@ -65,10 +66,12 @@ class GDCVaultIE(InfoExtractor):
def _parse_flv(self, xml_description):
video_formats = []
- akami_url = xml_description.find('./metadata/akamaiHost').text
+ akamai_url = xml_description.find('./metadata/akamaiHost').text
slide_video_path = xml_description.find('./metadata/slideVideo').text
video_formats.append({
- 'url': 'rtmp://' + akami_url + '/' + slide_video_path,
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(slide_video_path, '.flv'),
+ 'ext': 'flv',
'format_note': 'slide deck video',
'quality': -2,
'preference': -2,
@@ -76,7 +79,9 @@ class GDCVaultIE(InfoExtractor):
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
video_formats.append({
- 'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(speaker_video_path, '.flv'),
+ 'ext': 'flv',
'format_note': 'speaker video',
'quality': -1,
'preference': -1,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f4500e931..875e1bf05 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -473,6 +473,7 @@ class GenericIE(InfoExtractor):
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
'info_dict': {
+ 'id': '1986',
'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
},
'playlist_mincount': 2,
@@ -531,13 +532,31 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
- 'upload_date': '20150204',
+ 'upload_date': '20150212',
'uploader': 'The National Archives UK',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'uploader_id': 'NationalArchives08',
'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
},
- }
+ },
+ # rtl.nl embed
+ {
+ 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'aanslagen-kopenhagen',
+ 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
+ }
+ },
+ # Zapiks embed
+ {
+ 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
+ 'info_dict': {
+ 'id': '118046',
+ 'ext': 'mp4',
+ 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
+ }
+ },
]
def report_following_redirect(self, new_url):
@@ -782,6 +801,13 @@ class GenericIE(InfoExtractor):
'entries': entries,
}
+ # Look for embedded rtl.nl player
+ matches = re.findall(
+ r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+ webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='RtlNl')
+
# Look for embedded (iframe) Vimeo player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
@@ -789,7 +815,6 @@ class GenericIE(InfoExtractor):
player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
return self.url_result(surl)
-
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
@@ -1082,6 +1107,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream')
+ # Look for Zapiks embed
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Zapiks')
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 3db668cd0..3aade9e74 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -34,6 +34,9 @@ class IGNIE(InfoExtractor):
},
{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+ 'info_dict': {
+ 'id': '100-little-things-in-gta-5-that-will-blow-your-mind',
+ },
'playlist': [
{
'info_dict': {
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
new file mode 100644
index 000000000..fe5d95e2c
--- /dev/null
+++ b/youtube_dl/extractor/imgur.py
@@ -0,0 +1,97 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ mimetype2ext,
+ ExtractorError,
+)
+
+
+class ImgurIE(InfoExtractor):
+    # Handles single imgur.com / i.imgur.com items, with or without a
+    # trailing .mp4 / .gifv suffix.
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+
+    _TESTS = [{
+        'url': 'https://i.imgur.com/A61SaA1.gifv',
+        'info_dict': {
+            'id': 'A61SaA1',
+            'ext': 'mp4',
+            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
+        },
+    }, {
+        'url': 'https://imgur.com/A61SaA1',
+        'info_dict': {
+            'id': 'A61SaA1',
+            'ext': 'mp4',
+            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        # Returns an info dict with one format per <source> element found in
+        # the page's "video-elements" container, plus an optional low
+        # preference GIF format taken from the inline `videoItem` object.
+        # Raises an expected ExtractorError when the container is absent.
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Both dimensions are optional; a missing <param> leaves them None.
+        raw_width = self._search_regex(
+            r'<param name="width" value="([0-9]+)"',
+            webpage, 'width', fatal=False)
+        width = int_or_none(raw_width)
+        raw_height = self._search_regex(
+            r'<param name="height" value="([0-9]+)"',
+            webpage, 'height', fatal=False)
+        height = int_or_none(raw_height)
+
+        sources_html = self._search_regex(
+            r'(?s)<div class="video-elements">(.*?)</div>',
+            webpage, 'video elements', default=None)
+        if not sources_html:
+            raise ExtractorError(
+                'No sources found for video %s. Maybe an image?' % video_id,
+                expected=True)
+
+        def request_headers():
+            # A fresh dict per format so that no two formats share state.
+            return {'User-Agent': 'youtube-dl (like wget)'}
+
+        formats = [{
+            # Use the MIME subtype (the part after "/") as the format id.
+            'format_id': source.group('type').partition('/')[2],
+            'url': self._proto_relative_url(source.group('src')),
+            'ext': mimetype2ext(source.group('type')),
+            'acodec': 'none',
+            'width': width,
+            'height': height,
+            'http_headers': request_headers(),
+        } for source in re.finditer(
+            r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"',
+            sources_html)]
+
+        # The GIF rendition is a bonus: not finding it is not an error.
+        gif_js = self._search_regex(
+            r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
+            webpage, 'GIF code', fatal=False)
+        if gif_js:
+            gif_info = self._parse_json(
+                gif_js, video_id, transform_source=js_to_json)
+            gif_format = {
+                'format_id': 'gif',
+                'preference': -10,
+                'width': width,
+                'height': height,
+                'ext': 'gif',
+                'acodec': 'none',
+                'vcodec': 'gif',
+                'container': 'gif',
+                'url': self._proto_relative_url(gif_info['gifUrl']),
+                'filesize': gif_info.get('size'),
+                'http_headers': request_headers(),
+            }
+            formats.append(gif_format)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+            'title': self._og_search_title(webpage),
+        }
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 5247c6f58..3642089f7 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -37,6 +37,7 @@ class LivestreamIE(InfoExtractor):
'url': 'http://new.livestream.com/tedx/cityenglish',
'info_dict': {
'title': 'TEDCity2.0 (English)',
+ 'id': '2245590',
},
'playlist_mincount': 4,
}, {
@@ -148,7 +149,8 @@ class LivestreamIE(InfoExtractor):
if is_relevant(video_data, video_id)]
if video_id is None:
# This is an event page:
- return self.playlist_result(videos, info['id'], info['full_name'])
+ return self.playlist_result(
+ videos, '%s' % info['id'], info['full_name'])
else:
if not videos:
raise ExtractorError('Cannot find video %s' % video_id)
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
new file mode 100644
index 000000000..c18640c5a
--- /dev/null
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ smuggle_url,
+ url_basename,
+)
+
+
+class NationalGeographicIE(InfoExtractor):
+    # Resolves video.nationalgeographic.com pages to a ThePlatform SMIL URL
+    # and delegates the actual extraction via url_result().
+    _VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?'
+
+    _TEST = {
+        'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+        'info_dict': {
+            'id': '4DmDACA6Qtk_',
+            'ext': 'flv',
+            'title': 'Mating Crabs Busted by Sharks',
+            'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+        },
+        'add_ie': ['ThePlatform'],
+    }
+
+    def _real_extract(self, url):
+        # The last path component of the page URL doubles as the display
+        # name used in download progress messages.
+        name = url_basename(url)
+        webpage = self._download_webpage(url, name)
+
+        # The page advertises an MRSS feed and the GUID of the video in it.
+        feed_url = self._search_regex(
+            r'data-feed-url="([^"]+)"', webpage, 'feed url')
+        guid = self._search_regex(
+            r'data-video-guid="([^"]+)"', webpage, 'guid')
+        feed_query = '%s?byGuid=%s' % (feed_url, guid)
+        feed = self._download_xml(feed_query, name)
+
+        # The ThePlatform id is the basename of the media:content URL.
+        media_content = feed.find('.//{http://search.yahoo.com/mrss/}content')
+        theplatform_id = url_basename(media_content.attrib.get('url'))
+
+        # For some reason, the normal links don't work and we must force the use of f4m
+        smil_url = 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id
+        return self.url_result(
+            smuggle_url(smil_url, {'force_smil_url': True}))
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 89a2845fe..3645d3033 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -18,13 +18,13 @@ class NBCIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ 'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
# md5 checksum is not stable
'info_dict': {
- 'id': 'bTmnLCvIbaaH',
+ 'id': 'c9xnCo0YPOPH',
'ext': 'flv',
- 'title': 'I Am a Firefighter',
- 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
+ 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
},
},
{
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
index 93567d1e3..bc17e20aa 100644
--- a/youtube_dl/extractor/netzkino.py
+++ b/youtube_dl/extractor/netzkino.py
@@ -29,6 +29,9 @@ class NetzkinoIE(InfoExtractor):
'timestamp': 1344858571,
'age_limit': 12,
},
+ 'params': {
+ 'skip_download': 'Download only works from Germany',
+ }
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index 5429592a7..f179ea200 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -1,9 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
from ..utils import (
js_to_json,
@@ -11,7 +8,7 @@ from ..utils import (
class PatreonIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
_TESTS = [
{
'url': 'http://www.patreon.com/creation?hid=743933',
@@ -35,6 +32,23 @@ class PatreonIE(InfoExtractor):
'thumbnail': 're:^https?://.*$',
},
},
+ {
+ 'url': 'https://www.patreon.com/creation?hid=1682498',
+ 'info_dict': {
+ 'id': 'SU4fj_aEMVw',
+ 'ext': 'mp4',
+ 'title': 'I\'m on Patreon!',
+ 'uploader': 'TraciJHines',
+ 'thumbnail': 're:^https?://.*$',
+ 'upload_date': '20150211',
+ 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4',
+ 'uploader_id': 'TraciJHines',
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True,
+ }
+ }
]
# Currently Patreon exposes download URL via hidden CSS, so login is not
@@ -65,26 +79,29 @@ class PatreonIE(InfoExtractor):
'''
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage).strip()
attach_fn = self._html_search_regex(
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
+ embed = self._html_search_regex(
+ r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+ webpage, 'embedded URL', default=None)
+
if attach_fn is not None:
video_url = 'http://www.patreon.com' + attach_fn
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
r'<strong>(.*?)</strong> is creating', webpage, 'uploader')
+ elif embed is not None:
+ return self.url_result(embed)
else:
- playlist_js = self._search_regex(
+ playlist = self._parse_json(self._search_regex(
r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])',
- webpage, 'playlist JSON')
- playlist_json = js_to_json(playlist_js)
- playlist = json.loads(playlist_json)
+ webpage, 'playlist JSON'),
+ video_id, transform_source=js_to_json)
data = playlist[0]
video_url = self._proto_relative_url(data['mp3'])
thumbnail = self._proto_relative_url(data.get('cover'))
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index fb2032832..3a27e3789 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -56,7 +56,7 @@ class PornHubIE(InfoExtractor):
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
- r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
+ r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
@@ -110,3 +110,33 @@ class PornHubIE(InfoExtractor):
'formats': formats,
'age_limit': 18,
}
+
+
+class PornHubPlaylistIE(InfoExtractor):
+    # Turns a pornhub.com playlist page into a playlist of PornHub entries.
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.pornhub.com/playlist/6201671',
+        'info_dict': {
+            'id': '6201671',
+            'title': 'P0p4',
+        },
+        'playlist_mincount': 35,
+    }]
+
+    def _real_extract(self, url):
+        # Returns a playlist result whose entries are the distinct
+        # view_video.php links found on the page, in page order. Title and
+        # description come from the inline `playlistObject` JSON.
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        # The same video is usually linked more than once (thumbnail and
+        # title). De-duplicate while keeping first-seen order so that the
+        # resulting playlist order is deterministic and follows the page.
+        seen = set()
+        video_paths = []
+        for video_path in re.findall(
+                r'href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage):
+            if video_path in seen:
+                continue
+            seen.add(video_path)
+            video_paths.append(video_path)
+
+        entries = [
+            self.url_result('http://www.pornhub.com/%s' % video_path, 'PornHub')
+            for video_path in video_paths
+        ]
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
+            playlist_id)
+
+        return self.playlist_result(
+            entries, playlist_id, playlist.get('title'), playlist.get('description'))
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
new file mode 100644
index 000000000..976c8feec
--- /dev/null
+++ b/youtube_dl/extractor/r7.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ unescapeHTML,
+ int_or_none,
+)
+
+
+class R7IE(InfoExtractor):
+    # Accepts *.r7.com "idmedia" pages, noticias.r7.com article URLs that
+    # end in the media id, and direct player.r7.com URLs. The id is a
+    # 24 character hexadecimal string in all three cases.
+    _VALID_URL = r'''(?x)https?://
+                        (?:
+                            (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
+                            noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
+                            player\.r7\.com/video/i/
+                        )
+                        (?P<id>[\da-f]{24})
+                        '''
+    _TESTS = [{
+        'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
+        'md5': '403c4e393617e8e8ddc748978ee8efde',
+        'info_dict': {
+            'id': '54e7050b0cf2ff57e0279389',
+            'ext': 'mp4',
+            'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 98,
+            'like_count': int,
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/',
+        'only_matching': True,
+    }, {
+        'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        # Whatever URL form was given, the player page for the id is what
+        # gets downloaded; it embeds a JavaScript `item` object describing
+        # the media.
+        video_id = self._match_id(url)
+
+        player_url = 'http://player.r7.com/video/i/%s' % video_id
+        webpage = self._download_webpage(player_url, video_id)
+
+        item_js = self._search_regex(
+            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')
+        item = self._parse_json(js_to_json(item_js), video_id)
+
+        title = unescapeHTML(item['title'])
+        thumbnail = item.get('init', {}).get('thumbUri')
+
+        stats = item.get('statistics', {})
+        like_count = int_or_none(stats.get('likes'))
+        view_count = int_or_none(stats.get('views'))
+
+        # Only the first playlist entry is used. It maps a format key to a
+        # dict describing one rendition.
+        duration = None
+        formats = []
+        for key, rendition in item['playlist'][0].items():
+            src = rendition.get('src')
+            if src:
+                # The duration is taken from the first rendition that has a
+                # source, whatever value it carries there.
+                if duration is None:
+                    duration = rendition.get('duration')
+                if '.f4m' in src:
+                    formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
+                elif src.endswith('.m3u8'):
+                    formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
+                else:
+                    # Plain HTTP rendition: prefer the declared format name
+                    # and fall back to the playlist key.
+                    formats.append({
+                        'url': src,
+                        'format_id': rendition.get('format') or key,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'like_count': like_count,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py
index f95bc9454..aa5f6f8ad 100644
--- a/youtube_dl/extractor/radiode.py
+++ b/youtube_dl/extractor/radiode.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import json
-
from .common import InfoExtractor
@@ -10,13 +8,13 @@ class RadioDeIE(InfoExtractor):
_VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)'
_TEST = {
'url': 'http://ndr2.radio.de/',
- 'md5': '3b4cdd011bc59174596b6145cda474a4',
'info_dict': {
'id': 'ndr2',
'ext': 'mp3',
'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:591c49c702db1a33751625ebfb67f273',
'thumbnail': 're:^https?://.*\.png',
+ 'is_live': True,
},
'params': {
'skip_download': True,
@@ -25,16 +23,15 @@ class RadioDeIE(InfoExtractor):
def _real_extract(self, url):
radio_id = self._match_id(url)
-
webpage = self._download_webpage(url, radio_id)
+ jscode = self._search_regex(
+ r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n",
+ webpage, 'broadcast')
- broadcast = json.loads(self._search_regex(
- r'_getBroadcast\s*=\s*function\(\s*\)\s*{\s*return\s+({.+?})\s*;\s*}',
- webpage, 'broadcast'))
-
+ broadcast = self._parse_json(jscode, radio_id)
title = self._live_title(broadcast['name'])
description = broadcast.get('description') or broadcast.get('shortDescription')
- thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl')
+ thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100')
formats = [{
'url': stream['streamUrl'],
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index a3ca79f2c..cfce4550a 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -1,16 +1,25 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import parse_duration
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
-class RtlXlIE(InfoExtractor):
- IE_NAME = 'rtlxl.nl'
- _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+class RtlNlIE(InfoExtractor):
+ IE_NAME = 'rtl.nl'
+ IE_DESC = 'rtl.nl and rtlxl.nl'
+ _VALID_URL = r'''(?x)
+ https?://(www\.)?
+ (?:
+ rtlxl\.nl/\#!/[^/]+/|
+ rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+ )
+ (?P<id>[0-9a-f-]+)'''
- _TEST = {
+ _TESTS = [{
'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
'md5': 'cc16baa36a6c169391f0764fa6b16654',
'info_dict': {
@@ -22,21 +31,30 @@ class RtlXlIE(InfoExtractor):
'upload_date': '20140814',
'duration': 576.880,
},
- }
+ }, {
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
+ 'md5': 'dea7474214af1271d91ef332fb8be7ea',
+ 'info_dict': {
+ 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
+ 'ext': 'mp4',
+ 'timestamp': 1424039400,
+ 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+ 'upload_date': '20150215',
+ 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
+ }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uuid = mobj.group('uuid')
-
+ uuid = self._match_id(url)
info = self._download_json(
'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
uuid)
material = info['material'][0]
- episode_info = info['episodes'][0]
-
progname = info['abstracts'][0]['name']
subtitle = material['title'] or info['episodes'][0]['name']
+ description = material.get('synopsis') or info['episodes'][0]['synopsis']
# Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
videopath = material['videopath'].replace('.f4m', '.m3u8')
@@ -58,14 +76,29 @@ class RtlXlIE(InfoExtractor):
'quality': 0,
}
])
-
self._sort_formats(formats)
+ thumbnails = []
+ meta = info.get('meta', {})
+ for p in ('poster_base_url', '"thumb_base_url"'):
+ if not meta.get(p):
+ continue
+
+ thumbnails.append({
+ 'url': self._proto_relative_url(meta[p] + uuid),
+ 'width': int_or_none(self._search_regex(
+ r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)),
+ 'height': int_or_none(self._search_regex(
+ r'/sz=[0-9]+x([0-9]+)',
+ meta[p], 'thumbnail height', fatal=False))
+ })
+
return {
'id': uuid,
'title': '%s - %s' % (progname, subtitle),
'formats': formats,
'timestamp': material['original_date'],
- 'description': episode_info['synopsis'],
+ 'description': description,
'duration': parse_duration(material.get('duration')),
+ 'thumbnails': thumbnails,
}
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 3469d9578..e60f85b5b 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -6,6 +6,7 @@ import re
import time
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
struct_unpack,
remove_end,
@@ -96,12 +97,10 @@ class RTVEALaCartaIE(InfoExtractor):
).replace('.net.rtve', '.multimedia.cdn.rtve')
video_path = self._download_webpage(
auth_url, video_id, 'Getting video url')
- # Use mvod.akcdn instead of flash.akamaihd.multimedia.cdn to get
+ # Use mvod1.akcdn instead of flash.akamaihd.multimedia.cdn to get
# the right Content-Length header and the mp4 format
- video_url = (
- 'http://mvod.akcdn.rtve.es/{0}&v=2.6.8'
- '&fp=MAC%2016,0,0,296&r=MRUGG&g=OEOJWFXNFGCP'.format(video_path)
- )
+ video_url = compat_urlparse.urljoin(
+ 'http://mvod1.akcdn.rtve.es/', video_path)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py
new file mode 100644
index 000000000..9c88167f0
--- /dev/null
+++ b/youtube_dl/extractor/sandia.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ mimetype2ext,
+ unified_strdate,
+)
+
+
+class SandiaIE(InfoExtractor):
+    # Extractor for Mediasite presentations hosted at digitalops.sandia.gov.
+    IE_DESC = 'Sandia National Laboratories'
+    _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)'
+    _TEST = {
+        'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
+        'md5': '9422edc9b9a60151727e4b6d8bef393d',
+        'info_dict': {
+            'id': '24aace4429fc450fb5b38cdbf424a66e1d',
+            'ext': 'mp4',
+            'title': 'Xyce Software Training - Section 1',
+            'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
+            'upload_date': '20120904',
+            'duration': 7794,
+        }
+    }
+
+    def _real_extract(self, url):
+        # Downloads the play page (with a player-capabilities cookie), then
+        # the presentation's JavaScript manifest, and reads video URLs,
+        # slide data and metadata out of the
+        # `Mediasite.PlaybackManifest.<key> = <value>;` assignments in it.
+        video_id = self._match_id(url)
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
+        webpage = self._download_webpage(req, video_id)
+
+        js_path = self._search_regex(
+            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
+            webpage, 'JS code URL')
+        js_url = compat_urlparse.urljoin(url, js_path)
+
+        js_code = self._download_webpage(
+            js_url, video_id, note='Downloading player')
+
+        def extract_str(key, **args):
+            # Raw right-hand side of the manifest assignment for `key`.
+            # Extra keyword arguments (fatal=..., default=...) are passed
+            # through to _search_regex.
+            return self._search_regex(
+                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
+                js_code, key, **args)
+
+        def extract_data(key, **args):
+            # Like extract_str, but the value is parsed as a JavaScript
+            # literal. None is passed through when the key was not found.
+            data_json = extract_str(key, **args)
+            if data_json is None:
+                return data_json
+            return self._parse_json(
+                data_json, video_id, transform_source=js_to_json)
+
+        # VideoUrls is an indexed series of assignments; walk it until the
+        # first missing index.
+        formats = []
+        for i in itertools.count():
+            fd = extract_data('VideoUrls[%d]' % i, default=None)
+            if fd is None:
+                break
+            formats.append({
+                'format_id': '%s' % i,
+                'format_note': fd['MimeType'].partition('/')[2],
+                'ext': mimetype2ext(fd['MimeType']),
+                'url': fd['Location'],
+                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+            })
+        # Kept from the original: the video formats are sorted once before
+        # the slideshow pseudo-format is appended.
+        # NOTE(review): presumably this also rejects an empty format list
+        # early - confirm against InfoExtractor._sort_formats.
+        self._sort_formats(formats)
+
+        slide_baseurl = compat_urlparse.urljoin(
+            url, extract_data('SlideBaseUrl'))
+        # Convert the .NET style placeholder ("{0:D4}") into a printf style
+        # one ("%04d"). The width may have more than one digit, hence
+        # `[0-9]+` rather than a single-character class.
+        slide_template = slide_baseurl + re.sub(
+            r'\{0:D?([0-9]+)\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
+        slides = []
+        last_slide_time = 0
+        # Slides are numbered from 1.
+        for i in itertools.count(1):
+            sd = extract_str('Slides[%d]' % i, default=None)
+            if sd is None:
+                break
+            timestamp = int_or_none(self._search_regex(
+                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
+                sd, 'slide %s timestamp' % i, fatal=False))
+            # The timestamp lookup is non-fatal, so it may be None. In that
+            # case the duration is unknown and the running reference time
+            # is left untouched instead of failing on `None - int`.
+            if timestamp is None:
+                slide_duration = None
+            else:
+                slide_duration = timestamp - last_slide_time
+                last_slide_time = timestamp
+            slides.append({
+                'url': slide_template % i,
+                'duration': slide_duration,
+            })
+        formats.append({
+            'format_id': 'slides',
+            'protocol': 'slideshow',
+            'url': json.dumps(slides),
+            'preference': -10000,  # Downloader not yet written
+        })
+        self._sort_formats(formats)
+
+        title = extract_data('Title')
+        description = extract_data('Description', fatal=False)
+        # scale=1000 divides the manifest value by 1000.
+        # NOTE(review): presumably milliseconds to seconds - confirm.
+        duration = int_or_none(extract_data(
+            'Duration', fatal=False), scale=1000)
+        upload_date = unified_strdate(extract_data('AirDate', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'upload_date': upload_date,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
index 7d3c0e937..b5fa6f1da 100644
--- a/youtube_dl/extractor/sockshare.py
+++ b/youtube_dl/extractor/sockshare.py
@@ -25,7 +25,6 @@ class SockshareIE(InfoExtractor):
'id': '437BE28B89D799D7',
'title': 'big_buck_bunny_720p_surround.avi',
'ext': 'avi',
- 'thumbnail': 're:^http://.*\.jpg$',
}
}
@@ -45,7 +44,7 @@ class SockshareIE(InfoExtractor):
''', webpage, 'hash')
fields = {
- "hash": confirm_hash,
+ "hash": confirm_hash.encode('utf-8'),
"confirm": "Continue as Free User"
}
@@ -68,7 +67,7 @@ class SockshareIE(InfoExtractor):
webpage, 'title', default=None)
thumbnail = self._html_search_regex(
r'<img\s+src="([^"]*)".+?name="bg"',
- webpage, 'thumbnail')
+ webpage, 'thumbnail', default=None)
formats = [{
'format_id': 'sd',
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
index a4f8ce6c3..3a4ddf57e 100644
--- a/youtube_dl/extractor/soundgasm.py
+++ b/youtube_dl/extractor/soundgasm.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
class SoundgasmIE(InfoExtractor):
+ IE_NAME = 'soundgasm'
_VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)'
_TEST = {
'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
@@ -38,3 +39,26 @@ class SoundgasmIE(InfoExtractor):
'title': audio_title,
'description': description
}
+
+
+class SoundgasmProfileIE(InfoExtractor):
+    # Turns a soundgasm.net user page into a playlist of that user's audios.
+    IE_NAME = 'soundgasm:profile'
+    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
+    _TEST = {
+        'url': 'http://soundgasm.net/u/ytdl',
+        'info_dict': {
+            'id': 'ytdl',
+        },
+        'playlist_count': 1,
+    }
+
+    def _real_extract(self, url):
+        # Returns a playlist (id = profile name) with one Soundgasm entry
+        # per link on the page that points below /u/<profile>/.
+        profile_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, profile_id)
+
+        # The profile name comes from the URL and is only constrained to
+        # "no slash", so it must be escaped before being used in a pattern.
+        audio_url_re = r'href="([^"]+/u/%s/[^"]+)' % re.escape(profile_id)
+        entries = [
+            self.url_result(audio_url, 'Soundgasm')
+            for audio_url in re.findall(audio_url_re, webpage)]
+
+        return self.playlist_result(entries, profile_id)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index a73da1c9c..5793dbc10 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -1,8 +1,10 @@
from __future__ import unicode_literals
+import base64
import re
from .common import InfoExtractor
+from ..utils import qualities
class TeamcocoIE(InfoExtractor):
@@ -24,8 +26,8 @@ class TeamcocoIE(InfoExtractor):
'info_dict': {
'id': '19705',
'ext': 'mp4',
- "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
- "title": "Louis C.K. Interview Pt. 1 11/3/11",
+ 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
+ 'title': 'Louis C.K. Interview Pt. 1 11/3/11',
'age_limit': 0,
}
}
@@ -42,42 +44,39 @@ class TeamcocoIE(InfoExtractor):
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
- video_id = mobj.group("video_id")
+ video_id = mobj.group('video_id')
if not video_id:
video_id = self._html_search_regex(
self._VIDEO_ID_REGEXES, webpage, 'video id')
- data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
- data = self._download_xml(
- data_url, display_id, 'Downloading data webpage')
+ embed_url = 'http://teamcoco.com/embed/v/%s' % video_id
+ embed = self._download_webpage(
+ embed_url, video_id, 'Downloading embed page')
+
+ encoded_data = self._search_regex(
+ r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data')
+ data = self._parse_json(
+ base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id)
- qualities = ['500k', '480p', '1000k', '720p', '1080p']
formats = []
- for filed in data.findall('files/file'):
- if filed.attrib.get('playmode') == 'all':
- # it just duplicates one of the entries
- break
- file_url = filed.text
- m_format = re.search(r'(\d+(k|p))\.mp4', file_url)
+ get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
+ for filed in data['files']:
+ m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
if m_format is not None:
format_id = m_format.group(1)
else:
- format_id = filed.attrib['bitrate']
+ format_id = filed['bitrate']
tbr = (
- int(filed.attrib['bitrate'])
- if filed.attrib['bitrate'].isdigit()
+ int(filed['bitrate'])
+ if filed['bitrate'].isdigit()
else None)
- try:
- quality = qualities.index(format_id)
- except ValueError:
- quality = -1
formats.append({
- 'url': file_url,
+ 'url': filed['url'],
'ext': 'mp4',
'tbr': tbr,
'format_id': format_id,
- 'quality': quality,
+ 'quality': get_quality(format_id),
})
self._sort_formats(formats)
@@ -86,8 +85,8 @@ class TeamcocoIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'formats': formats,
- 'title': self._og_search_title(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'description': self._og_search_description(webpage),
+ 'title': data['title'],
+ 'thumbnail': data.get('thumb', {}).get('href'),
+ 'description': data.get('teaser'),
'age_limit': self._family_friendly_search(webpage),
}
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 0c38c8f89..4cec06f8b 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -83,6 +83,22 @@ class TEDIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # YouTube video
+ 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
+ 'add_ie': ['Youtube'],
+ 'info_dict': {
+ 'id': 'aFBIPO-P7LM',
+ 'ext': 'mp4',
+ 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
+ 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
+ 'uploader': 'TEDx Talks',
+ 'uploader_id': 'TEDxTalks',
+ 'upload_date': '20111216',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
_NATIVE_FORMATS = {
@@ -132,11 +148,16 @@ class TEDIE(InfoExtractor):
talk_info = self._extract_info(webpage)['talks'][0]
- if talk_info.get('external') is not None:
- self.to_screen('Found video from %s' % talk_info['external']['service'])
+ external = talk_info.get('external')
+ if external:
+ service = external['service']
+ self.to_screen('Found video from %s' % service)
+ ext_url = None
+ if service.lower() == 'youtube':
+ ext_url = external.get('code')
return {
'_type': 'url',
- 'url': talk_info['external']['uri'],
+ 'url': ext_url or external['uri'],
}
formats = [{
diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py
index b65d8e03f..10239c906 100644
--- a/youtube_dl/extractor/theonion.py
+++ b/youtube_dl/extractor/theonion.py
@@ -4,11 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
class TheOnionIE(InfoExtractor):
- _VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
_TEST = {
'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
'md5': '19eaa9a39cf9b9804d982e654dc791ee',
@@ -22,10 +21,8 @@ class TheOnionIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- article_id = mobj.group('article_id')
-
- webpage = self._download_webpage(url, article_id)
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'"videoId":\s(\d+),', webpage, 'video ID')
@@ -34,10 +31,6 @@ class TheOnionIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage)
sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
- if not sources:
- raise ExtractorError(
- 'No sources found for video %s' % video_id, expected=True)
-
formats = []
for src, type_ in sources:
if type_ == 'video/mp4':
@@ -54,15 +47,15 @@ class TheOnionIE(InfoExtractor):
})
elif type_ == 'application/x-mpegURL':
formats.extend(
- self._extract_m3u8_formats(src, video_id, preference=-1))
+ self._extract_m3u8_formats(src, display_id, preference=-1))
else:
self.report_warning(
'Encountered unexpected format: %s' % type_)
-
self._sort_formats(formats)
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 5f24189cc..feac666f7 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -71,7 +71,9 @@ class ThePlatformIE(InfoExtractor):
if not provider_id:
provider_id = 'dJ5BDC'
- if mobj.group('config'):
+ if smuggled_data.get('force_smil_url', False):
+ smil_url = url
+ elif mobj.group('config'):
config_url = url + '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
new file mode 100644
index 000000000..1c4b6d635
--- /dev/null
+++ b/youtube_dl/extractor/tv4.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+)
+
+
+class TV4IE(InfoExtractor):
+ IE_DESC = 'tv4.se and tv4play.se'
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
+ (?:
+ tv4\.se/(?:[^/]+)/klipp/(?:.*)-|
+ tv4play\.se/
+ (?:
+ (?:program|barn)/(?:[^\?]+)\?video_id=|
+ iframe/video/|
+ film/|
+ sport/|
+ )
+ )(?P<id>[0-9]+)'''
+ _TESTS = [
+ {
+ 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
+ 'md5': '909d6454b87b10a25aa04c4bdd416a9b',
+ 'info_dict': {
+ 'id': '2491650',
+ 'ext': 'mp4',
+ 'title': 'Kalla Fakta 5 (english subtitles)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': int,
+ 'upload_date': '20131125',
+ },
+ },
+ {
+ 'url': 'http://www.tv4play.se/iframe/video/3054113',
+ 'md5': '77f851c55139ffe0ebd41b6a5552489b',
+ 'info_dict': {
+ 'id': '3054113',
+ 'ext': 'mp4',
+ 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.',
+ 'timestamp': int,
+ 'upload_date': '20150130',
+ },
+ },
+ {
+ 'url': 'http://www.tv4play.se/sport/3060959',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.tv4play.se/film/2378136',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON')
+
+ # If is_geo_restricted is true, it doesn't necessarily mean we can't download it
+ if info['is_geo_restricted']:
+ self.report_warning('This content might not be available in your country due to licensing restrictions.')
+ if info['requires_subscription']:
+ raise ExtractorError('This content requires subscription.', expected=True)
+
+ sources_data = self._download_json(
+ 'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id, video_id, 'Downloading sources JSON')
+ sources = sources_data['playback']
+
+ formats = []
+ for item in sources.get('items', {}).get('item', []):
+ ext, bitrate = item['mediaFormat'], item['bitrate']
+ formats.append({
+ 'format_id': '%s_%s' % (ext, bitrate),
+ 'tbr': bitrate,
+ 'ext': ext,
+ 'url': item['url'],
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'formats': formats,
+ 'description': info.get('description'),
+ 'timestamp': parse_iso8601(info.get('broadcast_date_time')),
+ 'duration': info.get('duration'),
+ 'thumbnail': info.get('image'),
+ 'is_live': sources.get('live'),
+ }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 87290d002..4b0d8988d 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -349,6 +349,13 @@ class TwitchStreamIE(TwitchBaseIE):
% (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
channel_id, 'mp4')
+ # prefer the 'source' stream, the others are limited to 30 fps
+ def _sort_source(f):
+ if f.get('m3u8_media') is not None and f['m3u8_media'].get('NAME') == 'Source':
+ return 1
+ return 0
+ formats = sorted(formats, key=_sort_source)
+
view_count = stream.get('viewers')
timestamp = parse_iso8601(stream.get('created_at'))
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
index ebd2a3dca..d6a7eb203 100644
--- a/youtube_dl/extractor/videolecturesnet.py
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -49,15 +49,31 @@ class VideoLecturesNetIE(InfoExtractor):
thumbnail = (
None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
- formats = [{
- 'url': v.attrib['src'],
- 'width': int_or_none(v.attrib.get('width')),
- 'height': int_or_none(v.attrib.get('height')),
- 'filesize': int_or_none(v.attrib.get('size')),
- 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
- 'ext': v.attrib.get('ext'),
- } for v in switch.findall('./video')
- if v.attrib.get('proto') == 'http']
+ formats = []
+ for v in switch.findall('./video'):
+ proto = v.attrib.get('proto')
+ if proto not in ['http', 'rtmp']:
+ continue
+ f = {
+ 'width': int_or_none(v.attrib.get('width')),
+ 'height': int_or_none(v.attrib.get('height')),
+ 'filesize': int_or_none(v.attrib.get('size')),
+ 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
+ 'ext': v.attrib.get('ext'),
+ }
+ src = v.attrib['src']
+ if proto == 'http':
+ if self._is_valid_url(src, video_id):
+ f['url'] = src
+ formats.append(f)
+ elif proto == 'rtmp':
+ f.update({
+ 'url': v.attrib['streamer'],
+ 'play_path': src,
+ 'rtmp_real_time': True,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 5930d5984..8f540f578 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import json
import re
import itertools
+import hashlib
from .common import InfoExtractor
from ..compat import (
@@ -17,6 +18,7 @@ from ..utils import (
InAdvancePagedList,
int_or_none,
RegexNotFoundError,
+ smuggle_url,
std_headers,
unsmuggle_url,
urlencode_postdata,
@@ -173,7 +175,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option')
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({
'password': password,
@@ -223,6 +225,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
if mobj.group('pro') or mobj.group('player'):
url = 'http://player.vimeo.com/video/' + video_id
+ password = self._downloader.params.get('videopassword', None)
+ if password:
+ headers['Cookie'] = '%s_password=%s' % (
+ video_id, hashlib.md5(password.encode('utf-8')).hexdigest())
+
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, headers)
try:
@@ -266,8 +273,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
+ if data and '_video_password_verified' in data:
+ raise ExtractorError('video password verification failed!')
self._verify_video_password(url, video_id, webpage)
- return self._real_extract(url)
+ return self._real_extract(
+ smuggle_url(url, {'_video_password_verified': 'verified'}))
else:
raise ExtractorError('Unable to extract info section',
cause=e)
@@ -398,6 +408,7 @@ class VimeoChannelIE(InfoExtractor):
_TESTS = [{
'url': 'http://vimeo.com/channels/tributes',
'info_dict': {
+ 'id': 'tributes',
'title': 'Vimeo Tributes',
},
'playlist_mincount': 25,
@@ -476,6 +487,7 @@ class VimeoUserIE(VimeoChannelIE):
'url': 'http://vimeo.com/nkistudio/videos',
'info_dict': {
'title': 'Nki',
+ 'id': 'nkistudio',
},
'playlist_mincount': 66,
}]
@@ -493,6 +505,7 @@ class VimeoAlbumIE(VimeoChannelIE):
_TESTS = [{
'url': 'http://vimeo.com/album/2632481',
'info_dict': {
+ 'id': '2632481',
'title': 'Staff Favorites: November 2013',
},
'playlist_mincount': 13,
@@ -523,6 +536,7 @@ class VimeoGroupsIE(VimeoAlbumIE):
_TESTS = [{
'url': 'http://vimeo.com/groups/rolexawards',
'info_dict': {
+ 'id': 'rolexawards',
'title': 'Rolex Awards for Enterprise',
},
'playlist_mincount': 73,
@@ -605,6 +619,7 @@ class VimeoLikesIE(InfoExtractor):
'url': 'https://vimeo.com/user755559/likes/',
'playlist_mincount': 293,
"info_dict": {
+ 'id': 'user755559_likes',
"description": "See all the videos urza likes",
"title": 'Videos urza likes',
},
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 81e02a624..7dea8c59d 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -217,6 +217,9 @@ class VKUserVideosIE(InfoExtractor):
_TEMPLATE_URL = 'https://vk.com/videos'
_TEST = {
'url': 'http://vk.com/videos205387401',
+ 'info_dict': {
+ 'id': '205387401',
+ },
'playlist_mincount': 4,
}
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 396cf4e83..73077a312 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -45,19 +45,17 @@ class WebOfStoriesIE(InfoExtractor):
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
- story_filename = self._search_regex(
- r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
- speaker_id = self._search_regex(
- r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
- story_id = self._search_regex(
- r'\.storyId\((\d+)\)', webpage, 'story ID')
- speaker_type = self._search_regex(
- r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
- great_life = self._search_regex(
- r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+ embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
+ r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
+ webpage, 'embed params').split(',')]
+
+ (
+ _, speaker_id, story_id, story_duration,
+ speaker_type, great_life, _thumbnail, _has_subtitles,
+ story_filename, _story_order) = embed_params
+
is_great_life_series = great_life == 'true'
- duration = int_or_none(self._search_regex(
- r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+ duration = int_or_none(story_duration)
# URL building, see: http://www.webofstories.com/scripts/player.js
ms_prefix = ''
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index cbe3dc7be..2ddf29a69 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -18,8 +18,8 @@ class WSJIE(InfoExtractor):
'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'ext': 'mp4',
'upload_date': '20150202',
- 'uploader_id': 'bbright',
- 'creator': 'bbright',
+ 'uploader_id': 'jdesai',
+ 'creator': 'jdesai',
'categories': list, # a long list
'duration': 90,
'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index e8490b028..1644f53c8 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -22,7 +22,7 @@ class XTubeIE(InfoExtractor):
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
- 'description': 'http://www.xtube.com an ET kind of thing',
+ 'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index f8e7041a0..97dbac4cc 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -24,7 +24,6 @@ class YahooIE(InfoExtractor):
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
new file mode 100644
index 000000000..b294767c5
--- /dev/null
+++ b/youtube_dl/extractor/yam.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ float_or_none,
+ month_by_abbreviation,
+)
+
+
+class YamIE(InfoExtractor):
+ _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
+
+ _TESTS = [{
+ # An audio file hosted on Yam
+ 'url': 'http://mymedia.yam.com/m/2283921',
+ 'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
+ 'info_dict': {
+ 'id': '2283921',
+ 'ext': 'mp3',
+ 'title': '發現 - 趙薇 京華煙雲主題曲',
+ 'uploader_id': 'princekt',
+ 'upload_date': '20080807',
+ 'duration': 313.0,
+ }
+ }, {
+ # An external video hosted on YouTube
+ 'url': 'http://mymedia.yam.com/m/3598173',
+ 'md5': '0238ceec479c654e8c2f1223755bf3e9',
+ 'info_dict': {
+ 'id': 'pJ2Deys283c',
+ 'ext': 'mp4',
+ 'upload_date': '20150202',
+ 'uploader': '新莊社大瑜伽社',
+ 'description': 'md5:f5cc72f0baf259a70fb731654b0d2eff',
+ 'uploader_id': '2323agoy',
+ 'title': '外婆的澎湖灣KTV-潘安邦',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ page = self._download_webpage(url, video_id)
+
+ # Is it hosted externally on YouTube?
+ youtube_url = self._html_search_regex(
+ r'<embed src="(http://www.youtube.com/[^"]+)"',
+ page, 'YouTube url', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
+ api_page = self._download_webpage(
+ 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
+ note='Downloading API page')
+ api_result_obj = compat_urlparse.parse_qs(api_page)
+
+ uploader_id = self._html_search_regex(
+ r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z]+)"',
+ page, 'uploader id', fatal=False)
+ mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2}) ' +
+ r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
+ if mobj:
+ upload_date = '%s%02d%02d' % (
+ mobj.group('year'),
+ month_by_abbreviation(mobj.group('mon')),
+ int(mobj.group('day')))
+ else:
+ upload_date = None
+ duration = float_or_none(api_result_obj['totaltime'][0], scale=1000)
+
+ return {
+ 'id': video_id,
+ 'url': api_result_obj['mp3file'][0],
+ 'title': self._html_search_meta('description', page),
+ 'duration': duration,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 1b2dbf276..22db896b1 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -540,26 +540,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
+ download_note = (
+ 'Downloading player %s' % player_url
+ if self._downloader.params.get('verbose') else
+ 'Downloading %s player %s' % (player_type, player_id)
+ )
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
res = self._parse_sig_js(code)
elif player_type == 'swf':
urlh = self._request_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
code = urlh.read()
res = self._parse_sig_swf(code)
else:
assert False, 'Invalid player type %r' % player_type
- if cache_spec is None:
- test_string = ''.join(map(compat_chr, range(len(example_sig))))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
return res
diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py
new file mode 100644
index 000000000..22a9a57e8
--- /dev/null
+++ b/youtube_dl/extractor/zapiks.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+ xpath_with_ns,
+ xpath_text,
+ int_or_none,
+)
+
+
+class ZapiksIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))'
+ _TESTS = [
+ {
+ 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
+ 'md5': 'aeb3c473b2d564b2d46d664d28d5f050',
+ 'info_dict': {
+ 'id': '80798',
+ 'ext': 'mp4',
+ 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!',
+ 'description': 'md5:7054d6f6f620c6519be1fe710d4da847',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 528,
+ 'timestamp': 1359044972,
+ 'upload_date': '20130124',
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ },
+ {
+ 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&amp;media_id=118046&amp;width=640&amp;height=360&amp;autoStart=false&amp;language=fr',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-media-id="(\d+)"', webpage, 'video id')
+
+ playlist = self._download_xml(
+ 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id,
+ display_id)
+
+ NS_MAP = {
+ 'jwplayer': 'http://rss.jwpcdn.com/'
+ }
+
+ def ns(path):
+ return xpath_with_ns(path, NS_MAP)
+
+ item = playlist.find('./channel/item')
+
+ title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = xpath_text(
+ item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None)
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'uploadDate', webpage, 'upload date', default=None), ' ')
+
+ view_count = int_or_none(self._search_regex(
+ r'UserPlays:(\d+)', webpage, 'view count', default=None))
+ comment_count = int_or_none(self._search_regex(
+ r'UserComments:(\d+)', webpage, 'comment count', default=None))
+
+ formats = []
+ for source in item.findall(ns('./jwplayer:source')):
+ format_id = source.attrib['label']
+ f = {
+ 'url': source.attrib['file'],
+ 'format_id': format_id,
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', format_id)
+ if m:
+ f['height'] = int(m.group('height'))
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ }