aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py5
-rw-r--r--youtube_dl/extractor/bbc.py45
-rw-r--r--youtube_dl/extractor/clipfish.py67
-rw-r--r--youtube_dl/extractor/common.py2
-rw-r--r--youtube_dl/extractor/dailymotion.py10
-rw-r--r--youtube_dl/extractor/dcn.py84
-rw-r--r--youtube_dl/extractor/facebook.py6
-rw-r--r--youtube_dl/extractor/generic.py17
-rw-r--r--youtube_dl/extractor/lynda.py5
-rw-r--r--youtube_dl/extractor/mdr.py2
-rw-r--r--youtube_dl/extractor/nowtv.py71
-rw-r--r--youtube_dl/extractor/periscope.py99
-rw-r--r--youtube_dl/extractor/pornhub.py4
-rw-r--r--youtube_dl/extractor/screenwavemedia.py90
-rw-r--r--youtube_dl/extractor/southpark.py8
-rw-r--r--youtube_dl/extractor/tudou.py9
-rw-r--r--youtube_dl/extractor/twitch.py12
-rw-r--r--youtube_dl/extractor/xhamster.py35
18 files changed, 421 insertions, 150 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7e5c90829..e38e77a27 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -118,6 +118,7 @@ from .dailymotion import (
)
from .daum import DaumIE
from .dbtv import DBTVIE
+from .dcn import DCNIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .dfb import DFBIE
@@ -431,6 +432,10 @@ from .orf import (
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
+from .periscope import (
+ PeriscopeIE,
+ QuickscopeIE,
+)
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 9a1b6e3dc..abc5a44a1 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -527,6 +527,18 @@ class BBCIE(BBCCoUkIE):
'skip_download': True,
}
}, {
+ # single video from video playlist embedded with vxp-playlist-data JSON
+ 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+ 'info_dict': {
+ 'id': 'p02w6qjc',
+ 'ext': 'mp4',
+ 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+ 'duration': 56,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
# single video story with digitalData
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
@@ -695,13 +707,36 @@ class BBCIE(BBCCoUkIE):
if not medias:
# Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
- media_asset_page = self._parse_json(
+ media_asset = self._search_regex(
+ r'mediaAssetPage\.init\(\s*({.+?}), "/',
+ webpage, 'media asset', default=None)
+ if media_asset:
+ media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
+ medias = []
+ for video in media_asset_page.get('videos', {}).values():
+ medias.extend(video.values())
+
+ if not medias:
+ # Multiple video playlist with single `now playing` entry (e.g.
+ # http://www.bbc.com/news/video_and_audio/must_see/33767813)
+ vxp_playlist = self._parse_json(
self._search_regex(
- r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'),
+ r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
+ webpage, 'playlist data'),
playlist_id)
- medias = []
- for video in media_asset_page.get('videos', {}).values():
- medias.extend(video.values())
+ playlist_medias = []
+ for item in vxp_playlist:
+ media = item.get('media')
+ if not media:
+ continue
+ playlist_medias.append(media)
+ # Download single video if found media with asset id matching the video id from URL
+ if item.get('advert', {}).get('assetId') == playlist_id:
+ medias = [media]
+ break
+ # Fallback to the whole playlist
+ if not medias:
+ medias = playlist_medias
entries = []
for num, media_meta in enumerate(medias, start=1):
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index a5c3cb7c6..7af903571 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,53 +1,68 @@
from __future__ import unicode_literals
import re
-import time
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
- parse_duration,
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+ remove_end,
)
class ClipfishIE(InfoExtractor):
- IE_NAME = 'clipfish'
-
- _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- 'md5': '2521cd644e862936cf2e698206e47385',
+ 'md5': '79bc922f3e8a9097b3d68a93780fd475',
'info_dict': {
'id': '3966754',
'ext': 'mp4',
'title': 'FIFA 14 - E3 2013 Trailer',
+ 'timestamp': 1370938118,
+ 'upload_date': '20130611',
'duration': 82,
- },
- 'skip': 'Blocked in the US'
+ }
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
- info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
- (video_id, int(time.time())))
- doc = self._download_xml(
- info_url, video_id, note='Downloading info page')
- title = doc.find('title').text
- video_url = doc.find('filename').text
- if video_url is None:
- xml_bytes = xml.etree.ElementTree.tostring(doc)
- raise ExtractorError('Cannot find video URL in document %r' %
- xml_bytes)
- thumbnail = doc.find('imageurl').text
- duration = parse_duration(doc.find('duration').text)
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_info = self._parse_json(
+ js_to_json(self._html_search_regex(
+ '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')),
+ video_id)
+
+ formats = []
+ for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage):
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.append({
+ 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
+ 'ext': 'mp4',
+ 'format_id': 'hls',
+ })
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': ext,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' - Video')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(video_info.get('length'))
+ timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))
return {
'id': video_id,
'title': title,
- 'url': video_url,
+ 'formats': formats,
'thumbnail': thumbnail,
'duration': duration,
+ 'timestamp': timestamp,
}
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 717dcec7b..def6caa0d 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -638,7 +638,7 @@ class InfoExtractor(object):
@staticmethod
def _meta_regex(prop):
return r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 85d945509..2d90b2224 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -15,7 +15,6 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
- orderedSet,
parse_iso8601,
str_to_int,
unescapeHTML,
@@ -278,7 +277,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
}]
def _extract_entries(self, id):
- video_ids = []
+ video_ids = set()
processed_urls = set()
for pagenum in itertools.count(1):
page_url = self._PAGE_TEMPLATE % (id, pagenum)
@@ -291,12 +290,13 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
processed_urls.add(urlh.geturl())
- video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
+ for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+ if video_id not in video_ids:
+ yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+ video_ids.add(video_id)
if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
- return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
- for video_id in orderedSet(video_ids)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
new file mode 100644
index 000000000..82261e25c
--- /dev/null
+++ b/youtube_dl/extractor/dcn.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class DCNIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887',
+ 'info_dict':
+ {
+ 'id': '17375',
+ 'ext': 'mp4',
+ 'title': 'رحلة العمر : الحلقة 1',
+ 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 2041,
+ 'timestamp': 1227504126,
+ 'upload_date': '20081124',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ request = compat_urllib_request.Request(
+ 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+ headers={'Origin': 'http://www.dcndigital.ae'})
+
+ video = self._download_json(request, video_id)
+ title = video.get('title_en') or video['title_ar']
+
+ webpage = self._download_webpage(
+ 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?'
+ + compat_urllib_parse.urlencode({
+ 'id': video['id'],
+ 'user_id': video['user_id'],
+ 'signature': video['signature'],
+ 'countries': 'Q0M=',
+ 'filter': 'DENY',
+ }), video_id)
+
+ m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+ rtsp_url = self._search_regex(
+ r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
+ if rtsp_url:
+ formats.append({
+ 'url': rtsp_url,
+ 'format_id': 'rtsp',
+ })
+
+ self._sort_formats(formats)
+
+ img = video.get('img')
+ thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None
+ duration = int_or_none(video.get('duration'))
+ description = video.get('description_en') or video.get('description_ar')
+ timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index e17bb9aea..178a7ca4c 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -17,6 +17,8 @@ from ..utils import (
int_or_none,
limit_length,
urlencode_postdata,
+ get_element_by_id,
+ clean_html,
)
@@ -42,6 +44,7 @@ class FacebookIE(InfoExtractor):
'id': '637842556329505',
'ext': 'mp4',
'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
+ 'uploader': 'Tennis on Facebook',
}
}, {
'note': 'Video without discernible title',
@@ -50,6 +53,7 @@ class FacebookIE(InfoExtractor):
'id': '274175099429670',
'ext': 'mp4',
'title': 'Facebook video #274175099429670',
+ 'uploader': 'Asif Nawab Butt',
},
'expected_warnings': [
'title'
@@ -161,6 +165,7 @@ class FacebookIE(InfoExtractor):
video_title = limit_length(video_title, 80)
if not video_title:
video_title = 'Facebook video #%s' % video_id
+ uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
return {
'id': video_id,
@@ -168,4 +173,5 @@ class FacebookIE(InfoExtractor):
'formats': formats,
'duration': int_or_none(video_data.get('video_duration')),
'thumbnail': video_data.get('thumbnail_src'),
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 27584c44c..901f77304 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -304,6 +304,19 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
+ {
+ # ooyala video embedded with http://player.ooyala.com/iframe.js
+ 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+ 'info_dict': {
+ 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+ 'ext': 'mp4',
+ 'title': '"Steve Jobs: Man in the Machine" trailer',
+ 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# multiple ooyala embeds on SBN network websites
{
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@ -1390,7 +1403,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
- mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
@@ -1725,7 +1738,7 @@ class GenericIE(InfoExtractor):
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
found = filter_video(re.findall(
- r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+ r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
if not found:
# Flow player
found = filter_video(re.findall(r'''(?xs)
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index a00f6e5e5..deead220a 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -17,7 +17,6 @@ from ..utils import (
class LyndaBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
- _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_NETRC_MACHINE = 'lynda'
@@ -41,7 +40,7 @@ class LyndaBaseIE(InfoExtractor):
request, None, 'Logging in as %s' % username)
# Not (yet) logged in
- m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
+ m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)
if m is not None:
response = m.group('json')
response_json = json.loads(response)
@@ -70,7 +69,7 @@ class LyndaBaseIE(InfoExtractor):
request, None,
'Confirming log in and log out from another device')
- if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
+ if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
raise ExtractorError('Unable to log in')
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 5fdd19027..fc7499958 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -29,7 +29,7 @@ class MDRIE(InfoExtractor):
doc = self._download_xml(domain + xmlurl, video_id)
formats = []
for a in doc.findall('./assets/asset'):
- url_el = a.find('.//progressiveDownloadUrl')
+ url_el = a.find('./progressiveDownloadUrl')
if url_el is None:
continue
abr = int(a.find('bitrateAudio').text) // 1000
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
index 0b5ff4760..66c627bec 100644
--- a/youtube_dl/extractor/nowtv.py
+++ b/youtube_dl/extractor/nowtv.py
@@ -1,12 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ determine_ext,
int_or_none,
parse_iso8601,
parse_duration,
@@ -15,7 +14,7 @@ from ..utils import (
class NowTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'
_TESTS = [{
# rtl
@@ -23,7 +22,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '203519',
'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Die neuen Bauern und eine Hochzeit',
'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -32,7 +31,7 @@ class NowTVIE(InfoExtractor):
'duration': 2786,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
}, {
@@ -41,7 +40,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '203481',
'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Berlin - Tag & Nacht (Folge 934)',
'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -50,7 +49,7 @@ class NowTVIE(InfoExtractor):
'duration': 2641,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
}, {
@@ -59,7 +58,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '165780',
'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Hals- und Beinbruch',
'description': 'md5:b50d248efffe244e6f56737f0911ca57',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -68,7 +67,7 @@ class NowTVIE(InfoExtractor):
'duration': 2742,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
}, {
@@ -77,7 +76,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '99205',
'display_id': 'medicopter-117/angst',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Angst!',
'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -86,7 +85,7 @@ class NowTVIE(InfoExtractor):
'duration': 3025,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
}, {
@@ -95,7 +94,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '203521',
'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -104,7 +103,7 @@ class NowTVIE(InfoExtractor):
'duration': 1083,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
}, {
@@ -113,7 +112,7 @@ class NowTVIE(InfoExtractor):
'info_dict': {
'id': '128953',
'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': "Büro-Fall / Chihuahua 'Joel'",
'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -122,15 +121,19 @@ class NowTVIE(InfoExtractor):
'duration': 3092,
},
'params': {
- # m3u8 download
+ # rtmp download
'skip_download': True,
},
+ }, {
+ 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
- station = mobj.group('station')
+ display_id = self._match_id(url)
info = self._download_json(
'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id,
@@ -148,29 +151,19 @@ class NowTVIE(InfoExtractor):
raise ExtractorError(
'Video %s is not available for free' % video_id, expected=True)
- f = info.get('format', {})
- station = f.get('station') or station
-
- STATIONS = {
- 'rtl': 'rtlnow',
- 'rtl2': 'rtl2now',
- 'vox': 'voxnow',
- 'nitro': 'rtlnitronow',
- 'ntv': 'n-tvnow',
- 'superrtl': 'superrtlnow'
- }
-
formats = []
for item in files['items']:
- item_path = remove_start(item['path'], '/')
- tbr = int_or_none(item['bitrate'])
- m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
- m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+ if determine_ext(item['path']) != 'f4v':
+ continue
+ app, play_path = remove_start(item['path'], '/').split('/', 1)
formats.append({
- 'url': m3u8_url,
- 'format_id': '%s-%sk' % (item['id'], tbr),
- 'ext': 'mp4',
- 'tbr': tbr,
+ 'url': 'rtmpe://fms.rtl.de',
+ 'app': app,
+ 'play_path': 'mp4:%s' % play_path,
+ 'ext': 'flv',
+ 'page_url': url,
+ 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf',
+ 'tbr': int_or_none(item.get('bitrate')),
})
self._sort_formats(formats)
@@ -178,6 +171,8 @@ class NowTVIE(InfoExtractor):
description = info.get('articleLong') or info.get('articleShort')
timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
duration = parse_duration(info.get('duration'))
+
+ f = info.get('format', {})
thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
return {
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
new file mode 100644
index 000000000..8ad936758
--- /dev/null
+++ b/youtube_dl/extractor/periscope.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import parse_iso8601
+
+
+class PeriscopeIE(InfoExtractor):
+ IE_DESC = 'Periscope'
+ _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+ 'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+ 'info_dict': {
+ 'id': '56102209',
+ 'ext': 'mp4',
+ 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+ 'timestamp': 1438978559,
+ 'upload_date': '20150807',
+ 'uploader': 'Bec Boop',
+ 'uploader_id': '1465763',
+ },
+ 'skip': 'Expires in 24 hours',
+ }
+
+ def _call_api(self, method, token):
+ return self._download_json(
+ 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token)
+
+ def _real_extract(self, url):
+ token = self._match_id(url)
+
+ broadcast_data = self._call_api('getBroadcastPublic', token)
+ broadcast = broadcast_data['broadcast']
+ status = broadcast['status']
+
+ uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
+ uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+
+ title = '%s - %s' % (uploader, status) if uploader else status
+ state = broadcast.get('state').lower()
+ if state == 'running':
+ title = self._live_title(title)
+ timestamp = parse_iso8601(broadcast.get('created_at'))
+
+ thumbnails = [{
+ 'url': broadcast[image],
+ } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+ stream = self._call_api('getAccessPublic', token)
+
+ formats = []
+ for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+ video_url = stream.get(format_id + '_url')
+ if not video_url:
+ continue
+ f = {
+ 'url': video_url,
+ 'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+ }
+ if format_id != 'rtmp':
+ f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8'
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': broadcast.get('id') or token,
+ 'title': title,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
+
+
+class QuickscopeIE(InfoExtractor):
+ IE_DESC = 'Quick Scope'
+ _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://watchonperiscope.com/broadcast/56180087',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ broadcast_id = self._match_id(url)
+ request = compat_urllib_request.Request(
+ 'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({
+ 'broadcast_id': broadcast_id,
+ 'entry_ticket': '',
+ 'from_push': 'false',
+ 'uses_sessions': 'true',
+ }).encode('utf-8'))
+ return self.url_result(
+ self._download_json(request, broadcast_id)['share_url'], 'Periscope')
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 0b7886840..7b0cdc41a 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor):
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
- video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+ video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
if webpage.find('"encrypted":true') != -1:
password = compat_urllib_parse_unquote_plus(
self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
@@ -94,7 +94,7 @@ class PornHubIE(InfoExtractor):
format = path.split('/')[5].split('_')[:2]
format = "-".join(format)
- m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
+ m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
if m is None:
height = None
tbr = None
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index d1ab66b32..3bc84989e 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -1,12 +1,11 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ js_to_json,
)
@@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor):
video_id = self._match_id(url)
playerdata = self._download_webpage(
- 'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id,
+ 'http://player.screenwavemedia.com/player.php?id=%s' % video_id,
video_id, 'Downloading player webpage')
vidtitle = self._search_regex(
r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
- vidurl = self._search_regex(
- r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
-
- videolist_url = None
-
- mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
- if mobj:
- videoserver = mobj.group('videoserver')
- mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
- vidid = mobj.group('vidid') if mobj else video_id
- videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
- else:
- mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
- if mobj:
- videolist_url = mobj.group('smil')
-
- if videolist_url:
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
- formats = []
- baseurl = vidurl[:vidurl.rfind('/') + 1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
- continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate,
- 'vcodec': 'none',
- })
- formats.append(format)
- else:
- formats = [{
- 'url': vidurl,
- }]
+
+ playerconfig = self._download_webpage(
+ 'http://player.screenwavemedia.com/player.js',
+ video_id, 'Downloading playerconfig webpage')
+
+ videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver')
+
+ sources = self._parse_json(
+ js_to_json(
+ self._search_regex(
+ r"sources\s*:\s*(\[[^\]]+?\])", playerconfig,
+ 'sources',
+ ).replace(
+ "' + thisObj.options.videoserver + '",
+ videoserver
+ ).replace(
+ "' + playerVidId + '",
+ video_id
+ )
+ ),
+ video_id
+ )
+
+ formats = []
+ for source in sources:
+ if source['type'] == 'hls':
+ formats.extend(self._extract_m3u8_formats(source['file'], video_id))
+ else:
+ format_label = source.get('label')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_label, 'height', default=None))
+ formats.append({
+ 'url': source['file'],
+ 'format': format_label,
+ 'ext': source.get('type'),
+ 'height': height,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index 7fb165a87..87b650468 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE):
'title': 'The Government Won\'t Respect My Privacy',
'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
},
+ }, {
+ # non-ASCII characters in initial URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+ 'playlist_count': 4,
+ }, {
+ # non-ASCII characters in redirect URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09',
+ 'playlist_count': 4,
}]
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index c89de5ba4..84fe71aef 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -29,6 +29,8 @@ class TudouIE(InfoExtractor):
}
}]
+ _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
+
def _url_for_id(self, id, quality=None):
info_url = "http://v2.tudou.com/f?id=" + str(id)
if quality:
@@ -54,6 +56,10 @@ class TudouIE(InfoExtractor):
thumbnail_url = self._search_regex(
r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)
+ player_url = self._search_regex(
+ r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']",
+ webpage, 'player URL', default=self._PLAYER_URL)
+
segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
segments = json.loads(segs_json)
# It looks like the keys are the arguments that have to be passed as
@@ -76,6 +82,9 @@ class TudouIE(InfoExtractor):
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
+ 'http_headers': {
+ 'Referer': player_url,
+ },
}
result.append(part_info)
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 73ce335b7..a2b6a35aa 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -7,12 +7,15 @@ import random
from .common import InfoExtractor
from ..compat import (
+ compat_parse_qs,
compat_str,
compat_urllib_parse,
+ compat_urllib_parse_urlparse,
compat_urllib_request,
)
from ..utils import (
ExtractorError,
+ parse_duration,
parse_iso8601,
)
@@ -185,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE):
_ITEM_SHORTCUT = 'v'
_TEST = {
- 'url': 'http://www.twitch.tv/riotgames/v/6528877',
+ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
'info_dict': {
'id': 'v6528877',
'ext': 'mp4',
@@ -197,6 +200,7 @@ class TwitchVodIE(TwitchItemBaseIE):
'uploader': 'Riot Games',
'uploader_id': 'riotgames',
'view_count': int,
+ 'start_time': 310,
},
'params': {
# m3u8 download
@@ -216,6 +220,12 @@ class TwitchVodIE(TwitchItemBaseIE):
item_id, 'mp4')
self._prefer_source(formats)
info['formats'] = formats
+
+ parsed_url = compat_urllib_parse_urlparse(url)
+ query = compat_parse_qs(parsed_url.query)
+ if 't' in query:
+ info['start_time'] = parse_duration(query['t'][0])
+
return info
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index b4ad513a0..97315750f 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,7 +4,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
unified_strdate,
str_to_int,
int_or_none,
@@ -22,7 +21,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
- 'uploader_id': 'Ruseful2011',
+ 'uploader': 'Ruseful2011',
'duration': 893,
'age_limit': 18,
}
@@ -34,7 +33,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
- 'uploader_id': 'jojo747400',
+ 'uploader': 'jojo747400',
'duration': 200,
'age_limit': 18,
}
@@ -46,12 +45,12 @@ class XHamsterIE(InfoExtractor):
]
def _real_extract(self, url):
- def extract_video_url(webpage):
- mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
- if mp4 is None:
- raise ExtractorError('Unable to extract media URL')
- else:
- return mp4.group(1)
+ def extract_video_url(webpage, name):
+ return self._search_regex(
+ [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+ r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+ r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+ webpage, name, group='mp4')
def is_hd(webpage):
return '<div class=\'icon iconHD\'' in webpage
@@ -75,10 +74,14 @@ class XHamsterIE(InfoExtractor):
if upload_date:
upload_date = unified_strdate(upload_date)
- uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
- webpage, 'uploader id', default='anonymous')
+ uploader = self._html_search_regex(
+ r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)",
+ webpage, 'uploader', default='anonymous')
- thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
+ thumbnail = self._search_regex(
+ [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
+ r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
+ webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
webpage, 'duration', fatal=False))
@@ -97,7 +100,9 @@ class XHamsterIE(InfoExtractor):
hd = is_hd(webpage)
- video_url = extract_video_url(webpage)
+ format_id = 'hd' if hd else 'sd'
+
+ video_url = extract_video_url(webpage, format_id)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
@@ -108,7 +113,7 @@ class XHamsterIE(InfoExtractor):
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
- video_url = extract_video_url(webpage)
+ video_url = extract_video_url(webpage, 'hd')
formats.append({
'url': video_url,
'format_id': 'hd',
@@ -122,7 +127,7 @@ class XHamsterIE(InfoExtractor):
'title': title,
'description': description,
'upload_date': upload_date,
- 'uploader_id': uploader_id,
+ 'uploader': uploader,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,