about summary refs log tree commit diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 11
-rw-r--r-- youtube_dl/extractor/arte.py 183
-rw-r--r-- youtube_dl/extractor/internetvideoarchive.py 87
-rw-r--r-- youtube_dl/extractor/nowvideo.py 43
-rw-r--r-- youtube_dl/extractor/rottentomatoes.py 16
-rw-r--r-- youtube_dl/extractor/videodetective.py 30
6 files changed, 301 insertions, 69 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 226c3a762..d76945a48 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE
from .addanime import AddAnimeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
-from .arte import ArteTvIE
+from .arte import (
+ ArteTvIE,
+ ArteTVPlus7IE,
+ ArteTVCreativeIE,
+ ArteTVFutureIE,
+)
from .auengine import AUEngineIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
@@ -62,6 +67,7 @@ from .ign import IGNIE, OneUPIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
+from .internetvideoarchive import InternetVideoArchiveIE
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@@ -83,6 +89,7 @@ from .nba import NBAIE
from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
+from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
@@ -92,6 +99,7 @@ from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
+from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .sina import SinaIE
@@ -121,6 +129,7 @@ from .veoh import VeohIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
+from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 4707d7cca..5ee8a67b1 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import re
import json
import xml.etree.ElementTree
@@ -7,15 +8,15 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
unified_strdate,
+ determine_ext,
+ get_element_by_id,
)
+# arte.tv serves video from several different sources, and the extraction
+# process is different for each one. The videos usually expire in 7 days, so
+# we can't add tests.
+
class ArteTvIE(InfoExtractor):
- """
- There are two sources of video in arte.tv: videos.arte.tv and
- www.arte.tv/guide, the extraction process is different for each one.
- The videos expire in 7 days, so we can't add tests.
- """
- _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
_LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
@@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor):
@classmethod
def suitable(cls, url):
- return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))
+ return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
# TODO implement Live Stream
# from ..utils import compat_urllib_parse
@@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
- mobj = re.match(self._EMISSION_URL, url)
- if mobj is not None:
- lang = mobj.group('lang')
- # This is not a real id, it can be for example AJT for the news
- # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
- video_id = mobj.group('id')
- return self._extract_emission(url, video_id, lang)
-
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
@@ -80,59 +73,6 @@ class ArteTvIE(InfoExtractor):
# self.extractLiveStream(url)
# return
- def _extract_emission(self, url, video_id, lang):
- """Extract from www.arte.tv/guide"""
- webpage = self._download_webpage(url, video_id)
- json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
-
- json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
- self.report_extraction(video_id)
- info = json.loads(json_info)
- player_info = info['videoJsonPlayer']
-
- info_dict = {'id': player_info['VID'],
- 'title': player_info['VTI'],
- 'description': player_info.get('VDE'),
- 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
- 'thumbnail': player_info['programImage'],
- 'ext': 'flv',
- }
-
- formats = player_info['VSR'].values()
- def _match_lang(f):
- # Return true if that format is in the language of the url
- if lang == 'fr':
- l = 'F'
- elif lang == 'de':
- l = 'A'
- regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
- return any(re.match(r, f['versionCode']) for r in regexes)
- # Some formats may not be in the same language as the url
- formats = filter(_match_lang, formats)
- # Some formats use the m3u8 protocol
- formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats)
- # We order the formats by quality
- formats = sorted(formats, key=lambda f: int(f['height']))
- # Prefer videos without subtitles in the same language
- formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)
- # Pick the best quality
- def _format(format_info):
- info = {'ext': 'flv',
- 'width': format_info.get('width'),
- 'height': format_info.get('height'),
- }
- if format_info['mediaType'] == u'rtmp':
- info['url'] = format_info['streamer']
- info['play_path'] = 'mp4:' + format_info['url']
- else:
- info_dict['url'] = format_info['url']
- return info
- info_dict['formats'] = [_format(f) for f in formats]
- # TODO: Remove when #980 has been merged
- info_dict.update(info_dict['formats'][-1])
-
- return info_dict
-
def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
@@ -182,3 +122,110 @@ class ArteTvIE(InfoExtractor):
'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
}
+
+
+class ArteTVPlus7IE(InfoExtractor):
+ IE_NAME = u'arte.tv:+7'
+ _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+
+ @classmethod
+ def _extract_url_info(cls, url):
+ mobj = re.match(cls._VALID_URL, url)
+ lang = mobj.group('lang')
+ # This is not a real id, it can be for example AJT for the news
+ # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+ video_id = mobj.group('id')
+ return video_id, lang
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, video_id)
+ return self._extract_from_webpage(webpage, video_id, lang)
+
+ def _extract_from_webpage(self, webpage, video_id, lang):
+ json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+
+ json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
+ self.report_extraction(video_id)
+ info = json.loads(json_info)
+ player_info = info['videoJsonPlayer']
+
+ info_dict = {
+ 'id': player_info['VID'],
+ 'title': player_info['VTI'],
+ 'description': player_info.get('VDE'),
+ 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
+ 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+ }
+
+ formats = player_info['VSR'].values()
+ def _match_lang(f):
+ if f.get('versionCode') is None:
+ return True
+ # Return true if that format is in the language of the url
+ if lang == 'fr':
+ l = 'F'
+ elif lang == 'de':
+ l = 'A'
+ regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
+ return any(re.match(r, f['versionCode']) for r in regexes)
+ # Some formats may not be in the same language as the url
+ formats = filter(_match_lang, formats)
+ # Some formats use the m3u8 protocol
+ formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
+ # We order the formats by quality
+ formats = sorted(formats, key=lambda f: int(f.get('height',-1)))
+ # Prefer videos without subtitles in the same language
+ formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
+ # Pick the best quality
+ def _format(format_info):
+ info = {
+ 'width': format_info.get('width'),
+ 'height': format_info.get('height'),
+ }
+ if format_info['mediaType'] == u'rtmp':
+ info['url'] = format_info['streamer']
+ info['play_path'] = 'mp4:' + format_info['url']
+ info['ext'] = 'flv'
+ else:
+ info['url'] = format_info['url']
+ info['ext'] = determine_ext(info['url'])
+ return info
+ info_dict['formats'] = [_format(f) for f in formats]
+ # TODO: Remove when #980 has been merged
+ info_dict.update(info_dict['formats'][-1])
+
+ return info_dict
+
+
+# Like ArteTVPlus7IE, it reads the arte_vp_url attribute from the webpage to extract the information
+class ArteTVCreativeIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:creative'
+ _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'
+
+ _TEST = {
+ u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
+ u'file': u'050489-002.mp4',
+ u'info_dict': {
+ u'title': u'Agentur Amateur #2 - Corporate Design',
+ },
+ }
+
+
+class ArteTVFutureIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:future'
+ _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
+ u'file': u'050940-003.mp4',
+ u'info_dict': {
+ u'title': u'Les champignons au secours de la planète',
+ },
+ }
+
+ def _real_extract(self, url):
+ anchor_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, anchor_id)
+ row = get_element_by_id(anchor_id, webpage)
+ return self._extract_from_webpage(row, anchor_id, lang)
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
new file mode 100644
index 000000000..5986459d6
--- /dev/null
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -0,0 +1,87 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ compat_urllib_parse,
+ xpath_with_ns,
+ determine_ext,
+)
+
+
+class InternetVideoArchiveIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
+
+ _TEST = {
+ u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+ u'file': u'452693.mp4',
+ u'info_dict': {
+ u'title': u'SKYFALL',
+ u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
+ u'duration': 156,
+ },
+ }
+
+ @staticmethod
+ def _build_url(query):
+ return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
+
+ @staticmethod
+ def _clean_query(query):
+ NEEDED_ARGS = ['publishedid', 'customerid']
+ query_dic = compat_urlparse.parse_qs(query)
+ cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
+ # Other player ids return m3u8 urls
+ cleaned_dic['playerid'] = '247'
+ cleaned_dic['videokbrate'] = '100000'
+ return compat_urllib_parse.urlencode(cleaned_dic)
+
+ def _real_extract(self, url):
+ query = compat_urlparse.urlparse(url).query
+ query_dic = compat_urlparse.parse_qs(query)
+ video_id = query_dic['publishedid'][0]
+ url = self._build_url(query)
+
+ flashconfiguration_xml = self._download_webpage(url, video_id,
+ u'Downloading flash configuration')
+ flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
+ file_url = flashconfiguration.find('file').text
+ file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
+ # Replace some of the parameters in the query to get the best quality
+ # and http links (no m3u8 manifests)
+ file_url = re.sub(r'(?<=\?)(.+)$',
+ lambda m: self._clean_query(m.group()),
+ file_url)
+ info_xml = self._download_webpage(file_url, video_id,
+ u'Downloading video info')
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ item = info.find('channel/item')
+
+ def _bp(p):
+ return xpath_with_ns(p,
+ {'media': 'http://search.yahoo.com/mrss/',
+ 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
+ formats = []
+ for content in item.findall(_bp('media:group/media:content')):
+ attr = content.attrib
+ f_url = attr['url']
+ formats.append({
+ 'url': f_url,
+ 'ext': determine_ext(f_url),
+ 'width': int(attr['width']),
+ 'bitrate': int(attr['bitrate']),
+ })
+ formats = sorted(formats, key=lambda f: f['bitrate'])
+
+ info = {
+ 'id': video_id,
+ 'title': item.find('title').text,
+ 'formats': formats,
+ 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
+ 'description': item.find('description').text,
+ 'duration': int(attr['duration']),
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
new file mode 100644
index 000000000..ab52ad401
--- /dev/null
+++ b/youtube_dl/extractor/nowvideo.py
@@ -0,0 +1,43 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urlparse
+
+
+class NowVideoIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)'
+ _TEST = {
+ u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+ u'file': u'0mw0yow7b6dxa.flv',
+ u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
+ u'info_dict': {
+ u"title": u"youtubedl test video _BaW_jenozKc.mp4"
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ webpage_url = 'http://www.nowvideo.ch/video/' + video_id
+ webpage = self._download_webpage(webpage_url, video_id)
+
+ self.report_extraction(video_id)
+
+ video_title = self._html_search_regex(r'<h4>(.*)</h4>',
+ webpage, u'video title')
+
+ video_key = self._search_regex(r'var fkzd="(.*)";',
+ webpage, u'video key')
+
+ api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
+ api_response = self._download_webpage(api_call, video_id,
+ u'Downloading API page')
+ video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
+
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': video_title,
+ }]
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
new file mode 100644
index 000000000..c79c39413
--- /dev/null
+++ b/youtube_dl/extractor/rottentomatoes.py
@@ -0,0 +1,16 @@
+from .videodetective import VideoDetectiveIE
+
+
+# It uses the same method as videodetective.com: the
+# internetvideoarchive.net URL is extracted from the og:video property
+class RottenTomatoesIE(VideoDetectiveIE):
+ _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
+ u'file': '613340.mp4',
+ u'info_dict': {
+ u'title': u'TOY STORY 3',
+ u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+ },
+ }
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
new file mode 100644
index 000000000..d89f84094
--- /dev/null
+++ b/youtube_dl/extractor/videodetective.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+from ..utils import (
+ compat_urlparse,
+)
+
+
+class VideoDetectiveIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
+ u'file': u'194487.mp4',
+ u'info_dict': {
+ u'title': u'KICK-ASS 2',
+ u'description': u'md5:65ba37ad619165afac7d432eaded6013',
+ u'duration': 138,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ og_video = self._og_search_video_url(webpage)
+ query = compat_urlparse.urlparse(og_video).query
+ return self.url_result(InternetVideoArchiveIE._build_url(query),
+ ie=InternetVideoArchiveIE.ie_key())