about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/canalplus.py | 64
-rw-r--r-- youtube_dl/extractor/carambatv.py | 24
-rw-r--r-- youtube_dl/extractor/cbsinteractive.py | 2
-rw-r--r-- youtube_dl/extractor/chirbit.py | 13
-rw-r--r-- youtube_dl/extractor/clipfish.py | 21
-rw-r--r-- youtube_dl/extractor/cmt.py | 7
-rw-r--r-- youtube_dl/extractor/crunchyroll.py | 1
-rw-r--r-- youtube_dl/extractor/orf.py | 46
-rw-r--r-- youtube_dl/extractor/safari.py | 12
-rw-r--r-- youtube_dl/extractor/videomore.py | 5
10 files changed, 116 insertions, 79 deletions
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 6dab226af..1c3c41d26 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -6,11 +6,13 @@ import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
+ dict_get,
ExtractorError,
HEADRequest,
- unified_strdate,
- qualities,
int_or_none,
+ qualities,
+ remove_end,
+ unified_strdate,
)
@@ -43,47 +45,46 @@ class CanalplusIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
- 'md5': '41f438a4904f7664b91b4ed0dec969dc',
'info_dict': {
- 'id': '1192814',
+ 'id': '1405510',
+ 'display_id': 'pid1830-c-zapping',
'ext': 'mp4',
- 'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014",
- 'description': "Toute l'année 2014 dans un Zapping exceptionnel !",
- 'upload_date': '20150105',
+ 'title': 'Zapping - 02/07/2016',
+ 'description': 'Le meilleur de toutes les chaînes, tous les jours',
+ 'upload_date': '20160702',
},
}, {
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
'info_dict': {
'id': '1108190',
- 'ext': 'flv',
- 'title': 'Le labyrinthe - Boing super ranger',
+ 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger',
+ 'ext': 'mp4',
+ 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe',
'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
'upload_date': '20140724',
},
'skip': 'Only works from France',
}, {
- 'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',
+ 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html',
+ 'md5': '4b47b12b4ee43002626b97fad8fb1de5',
'info_dict': {
- 'id': '1390231',
+ 'id': '1420213',
+ 'display_id': 'pid6318-videos-integrales',
'ext': 'mp4',
- 'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité",
- 'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6',
- 'upload_date': '20160512',
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016',
+ 'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799',
+ 'upload_date': '20161014',
},
+ 'skip': 'Only works from France',
}, {
- 'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',
+ 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
'info_dict': {
- 'id': '1398334',
+ 'id': '1420176',
+ 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
'ext': 'mp4',
- 'title': "L'invité de Bruce Toussaint du 07/06/2016 - ",
- 'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324',
- 'upload_date': '20160607',
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ',
+ 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.',
+ 'upload_date': '20161014',
},
}, {
'url': 'http://m.canalplus.fr/?vid=1398231',
@@ -95,18 +96,17 @@ class CanalplusIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid')
site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]
# Beware, some subclasses do not define an id group
- display_id = mobj.group('display_id') or video_id
+ display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html')
- if video_id is None:
- webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'],
- webpage, 'video id', group='id')
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(
+ [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)',
+ r'id=["\']canal_video_player(?P<id>\d+)'],
+ webpage, 'video id', group='id')
info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
video_data = self._download_json(info_url, video_id, 'Downloading video JSON')
diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py
index 5797fb951..66c0f900a 100644
--- a/youtube_dl/extractor/carambatv.py
+++ b/youtube_dl/extractor/carambatv.py
@@ -9,6 +9,8 @@ from ..utils import (
try_get,
)
+from .videomore import VideomoreIE
+
class CarambaTVIE(InfoExtractor):
_VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
@@ -62,14 +64,16 @@ class CarambaTVPageIE(InfoExtractor):
_VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
- 'md5': '',
+ 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86',
'info_dict': {
- 'id': '191910501',
- 'ext': 'mp4',
+ 'id': '475222',
+ 'ext': 'flv',
'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 2678.31,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ # duration reported by videomore is incorrect
+ 'duration': int,
},
+ 'add_ie': [VideomoreIE.ie_key()],
}
def _real_extract(self, url):
@@ -77,6 +81,16 @@ class CarambaTVPageIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
+ videomore_url = VideomoreIE._extract_url(webpage)
+ if videomore_url:
+ title = self._og_search_title(webpage)
+ return {
+ '_type': 'url_transparent',
+ 'url': videomore_url,
+ 'ie_key': VideomoreIE.ie_key(),
+ 'title': title,
+ }
+
video_url = self._og_search_property('video:iframe', webpage, default=None)
if not video_url:
diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py
index 821db20b2..57b18e81d 100644
--- a/youtube_dl/extractor/cbsinteractive.py
+++ b/youtube_dl/extractor/cbsinteractive.py
@@ -63,7 +63,7 @@ class CBSInteractiveIE(ThePlatformIE):
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
- r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'",
+ r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'",
webpage, 'data json')
data = self._parse_json(data_json, display_id)
vdata = data.get('video') or data['videos'][0]
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
index 61aed0167..f35df143a 100644
--- a/youtube_dl/extractor/chirbit.py
+++ b/youtube_dl/extractor/chirbit.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import base64
+import re
from .common import InfoExtractor
from ..utils import parse_duration
@@ -70,7 +71,6 @@ class ChirbitProfileIE(InfoExtractor):
'url': 'http://chirbit.com/ScarletBeauty',
'info_dict': {
'id': 'ScarletBeauty',
- 'title': 'Chirbits by ScarletBeauty',
},
'playlist_mincount': 3,
}
@@ -78,13 +78,10 @@ class ChirbitProfileIE(InfoExtractor):
def _real_extract(self, url):
profile_id = self._match_id(url)
- rss = self._download_xml(
- 'http://chirbit.com/rss/%s' % profile_id, profile_id)
+ webpage = self._download_webpage(url, profile_id)
entries = [
- self.url_result(audio_url.text, 'Chirbit')
- for audio_url in rss.findall('./channel/item/link')]
+ self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
+ for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
- title = rss.find('./channel/title').text
-
- return self.playlist_result(entries, profile_id, title)
+ return self.playlist_result(entries, profile_id)
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 3a47f6fa4..bb52e0c6f 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
@@ -10,15 +11,15 @@ from ..utils import (
class ClipfishIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- 'md5': '79bc922f3e8a9097b3d68a93780fd475',
+ 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
+ 'md5': '720563e467b86374c194bdead08d207d',
'info_dict': {
- 'id': '3966754',
+ 'id': '4343170',
'ext': 'mp4',
- 'title': 'FIFA 14 - E3 2013 Trailer',
- 'description': 'Video zu FIFA 14: E3 2013 Trailer',
- 'upload_date': '20130611',
- 'duration': 82,
+ 'title': 'S01 E01 - Ugly Americans - Date in der Hölle',
+ 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.',
+ 'upload_date': '20161005',
+ 'duration': 1291,
'view_count': int,
}
}
@@ -50,10 +51,14 @@ class ClipfishIE(InfoExtractor):
'tbr': int_or_none(video_info.get('bitrate')),
})
+ descr = video_info.get('descr')
+ if descr:
+ descr = descr.strip()
+
return {
'id': video_id,
'title': video_info['title'],
- 'description': video_info.get('descr'),
+ 'description': descr,
'formats': formats,
'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'),
'duration': int_or_none(video_info.get('media_length')),
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
index ac3bdfe8f..7d3e9b0c9 100644
--- a/youtube_dl/extractor/cmt.py
+++ b/youtube_dl/extractor/cmt.py
@@ -26,7 +26,7 @@ class CMTIE(MTVIE):
'id': '1504699',
'ext': 'mp4',
'title': 'Still The King Ep. 109 in 3 Minutes',
- 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.',
+ 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.',
'timestamp': 1469421000.0,
'upload_date': '20160725',
},
@@ -42,3 +42,8 @@ class CMTIE(MTVIE):
'%s said: video is not available' % cls.IE_NAME, expected=True)
return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url)
+
+ def _extract_mgid(self, webpage):
+ return self._search_regex(
+ r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
+ webpage, 'mgid', group='mgid')
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index c38fd095a..cc141f68e 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -150,6 +150,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# rtmp
'skip_download': True,
},
+ 'skip': 'Video gone',
}, {
'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
'info_dict': {
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 6ae30679a..c7b107572 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -1,28 +1,28 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
import re
import calendar
import datetime
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
HEADRequest,
unified_strdate,
- ExtractorError,
strip_jsonp,
int_or_none,
float_or_none,
determine_ext,
remove_end,
+ unescapeHTML,
)
class ORFTVthekIE(InfoExtractor):
IE_NAME = 'orf:tvthek'
IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
_TESTS = [{
'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
@@ -51,26 +51,23 @@ class ORFTVthekIE(InfoExtractor):
'skip_download': True, # rtsp downloads
},
'_skip': 'Blocked outside of Austria / Germany',
+ }, {
+ 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
+ 'skip_download': True,
+ }, {
+ 'url': 'http://tvthek.orf.at/profile/Universum/35429',
+ 'skip_download': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- data_json = self._search_regex(
- r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
- all_data = json.loads(data_json)
-
- def get_segments(all_data):
- for data in all_data:
- if data['name'] in (
- 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM',
- 'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'):
- return data['values']['segments']
-
- sdata = get_segments(all_data)
- if not sdata:
- raise ExtractorError('Unable to extract segments')
+ data_jsb = self._parse_json(
+ self._search_regex(
+ r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
+ webpage, 'playlist', group='json'),
+ playlist_id, transform_source=unescapeHTML)['playlist']['videos']
def quality_to_int(s):
m = re.search('([0-9]+)', s)
@@ -79,8 +76,11 @@ class ORFTVthekIE(InfoExtractor):
return int(m.group(1))
entries = []
- for sd in sdata:
- video_id = sd['id']
+ for sd in data_jsb:
+ video_id, title = sd.get('id'), sd.get('title')
+ if not video_id or not title:
+ continue
+ video_id = compat_str(video_id)
formats = [{
'preference': -10 if fd['delivery'] == 'hls' else None,
'format_id': '%s-%s-%s' % (
@@ -88,7 +88,7 @@ class ORFTVthekIE(InfoExtractor):
'url': fd['src'],
'protocol': fd['protocol'],
'quality': quality_to_int(fd['quality']),
- } for fd in sd['playlist_item_array']['sources']]
+ } for fd in sd['sources']]
# Check for geoblocking.
# There is a property is_geoprotection, but that's always false
@@ -115,14 +115,14 @@ class ORFTVthekIE(InfoExtractor):
self._check_formats(formats, video_id)
self._sort_formats(formats)
- upload_date = unified_strdate(sd['created_date'])
+ upload_date = unified_strdate(sd.get('created_date'))
entries.append({
'_type': 'video',
'id': video_id,
- 'title': sd['header'],
+ 'title': title,
'formats': formats,
'description': sd.get('description'),
- 'duration': int(sd['duration_in_seconds']),
+ 'duration': int_or_none(sd.get('duration_in_seconds')),
'upload_date': upload_date,
'thumbnail': sd.get('image_full_url'),
})
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 8b35fd244..c3aec1edd 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -157,7 +157,14 @@ class SafariCourseIE(SafariBaseIE):
IE_NAME = 'safari:course'
IE_DESC = 'safaribooksonline.com online courses'
- _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)|
+ techbus\.safaribooksonline\.com
+ )
+ /(?P<id>[^/]+)/?(?:[#?]|$)
+ '''
_TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
@@ -170,6 +177,9 @@ class SafariCourseIE(SafariBaseIE):
}, {
'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
'only_matching': True,
+ }, {
+ 'url': 'http://techbus.safaribooksonline.com/9780134426365',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py
index 8a11ff848..7f2566586 100644
--- a/youtube_dl/extractor/videomore.py
+++ b/youtube_dl/extractor/videomore.py
@@ -86,6 +86,11 @@ class VideomoreIE(InfoExtractor):
mobj = re.search(
r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1',
webpage)
+ if not mobj:
+ mobj = re.search(
+ r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)',
+ webpage)
+
if mobj:
return mobj.group('url')