Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--  youtube_dl/extractor/__init__.py  |  3
-rw-r--r--  youtube_dl/extractor/arte.py      | 63
-rw-r--r--  youtube_dl/extractor/common.py    |  1
-rw-r--r--  youtube_dl/extractor/statigram.py |  4
-rw-r--r--  youtube_dl/extractor/ted.py       |  2
-rw-r--r--  youtube_dl/extractor/tf1.py       | 35
-rw-r--r--  youtube_dl/extractor/tumblr.py    |  8
-rw-r--r--  youtube_dl/extractor/tutv.py      | 41
-rw-r--r--  youtube_dl/extractor/vimeo.py     |  2
-rw-r--r--  youtube_dl/extractor/wat.py       | 84
-rw-r--r--  youtube_dl/extractor/youtube.py   |  5
11 files changed, 231 insertions, 17 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index ec800d9fb..ba0e86713 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -47,14 +47,17 @@ from .statigram import StatigramIE
from .steam import SteamIE
from .teamcoco import TeamcocoIE
from .ted import TEDIE
+from .tf1 import TF1IE
from .traileraddict import TrailerAddictIE
from .tudou import TudouIE
from .tumblr import TumblrIE
+from .tutv import TutvIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE
+from .wat import WatIE
from .wimp import WimpIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index b061b9566..183274eb7 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -11,11 +11,21 @@ from ..utils import (
)
class ArteTvIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ """
+ There are two sources of videos on arte.tv: videos.arte.tv and
+ www.arte.tv/guide; the extraction process is different for each one.
+ The videos expire after 7 days, so we can't add tests.
+ """
+ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
+ @classmethod
+ def suitable(cls, url):
+ return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
+
# TODO implement Live Stream
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
@@ -44,17 +54,26 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- # This is not a real id, it can be for example AJT for the news
- # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
- video_id = mobj.group('id')
+ mobj = re.match(self._EMISSION_URL, url)
+ if mobj is not None:
+ name = mobj.group('name')
+ # This is not a real id; it can be, for example, AJT for the news
+ # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+ video_id = mobj.group('id')
+ return self._extract_emission(url, video_id)
+
+ mobj = re.match(self._VIDEOS_URL, url)
+ if mobj is not None:
+ id = mobj.group('id')
+ return self._extract_video(url, id)
if re.search(self._LIVE_URL, video_id) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
+ def _extract_emission(self, url, video_id):
+ """Extract from www.arte.tv/guide"""
webpage = self._download_webpage(url, video_id)
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@@ -68,6 +87,7 @@ class ArteTvIE(InfoExtractor):
'description': player_info['VDE'],
'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
'thumbnail': player_info['programImage'],
+ 'ext': 'flv',
}
formats = player_info['VSR'].values()
@@ -78,9 +98,36 @@ class ArteTvIE(InfoExtractor):
if format_info['mediaType'] == u'rtmp':
info_dict['url'] = format_info['streamer']
info_dict['play_path'] = 'mp4:' + format_info['url']
- info_dict['ext'] = 'mp4'
else:
info_dict['url'] = format_info['url']
- info_dict['ext'] = 'mp4'
return info_dict
+
+ def _extract_video(self, url, video_id):
+ """Extract from videos.arte.tv"""
+ config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+ config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
+ config_xml = self._download_webpage(config_xml_url, video_id)
+ config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
+ config_xml = self._download_webpage(config_xml_url, video_id)
+
+ video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
+ def _key(m):
+ quality = m.group('quality')
+ if quality == 'hd':
+ return 2
+ else:
+ return 1
+ # We pick the best quality
+ video_urls = sorted(video_urls, key=_key)
+ video_url = list(video_urls)[-1].group('url')
+
+ title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
+ thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
+ config_xml, 'thumbnail')
+ return {'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'url': video_url,
+ 'ext': 'flv',
+ }
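
The videos.arte.tv branch above builds the player-config URL purely by string substitution on the page URL and then prefers the 'hd' quality entry. A minimal standalone sketch of that rewrite and of the quality sort, using a hypothetical page URL and XML fragment (real pages expire, as the docstring notes):

    import re

    # Hypothetical videos.arte.tv page URL; real ones follow the same pattern
    # but expire after about a week.
    page_url = 'http://videos.arte.tv/fr/videos/some-documentary-1234567.html'

    # The same substitutions _extract_video performs to reach the asPlayerXml config.
    config_xml_url = page_url.replace('/videos/', '/do_delegate/videos/')
    config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
    print(config_xml_url)
    # http://videos.arte.tv/fr/do_delegate/videos/some-documentary-1234567,view,asPlayerXml.xml

    # Quality selection: 'hd' entries sort last, so [-1] picks the best one.
    sample_xml = '''
    <url quality="sd">rtmp://example/sd/stream</url>
    <url quality="hd">rtmp://example/hd/stream</url>
    '''
    matches = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', sample_xml))
    best = sorted(matches, key=lambda m: 2 if m.group('quality') == 'hd' else 1)[-1]
    print(best.group('url'))  # rtmp://example/hd/stream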
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 64d63e109..5c6fd7945 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ class InfoExtractor(object):
location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The subtitle file contents.
+ view_count: How many users have watched the video on the platform.
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib.request.urlopen
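
The new view_count field is just another optional key in the info dict an extractor may return. A purely illustrative sketch of a result carrying it (all values made up):

    # Illustrative only: an info dict using the newly documented, optional
    # view_count key next to the usual required fields.
    info = {
        'id': '10631273',
        'url': 'http://example.com/video.mp4',  # hypothetical media URL
        'ext': 'mp4',
        'title': 'Some title',
        'view_count': 54321,  # how many users have watched the video on the platform
    }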
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py
index 25bf5b85c..ae9a63e8b 100644
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/statigram.py
@@ -10,7 +10,7 @@ class StatigramIE(InfoExtractor):
u'md5': u'deda4ff333abe2e118740321e992605b',
u'info_dict': {
u"uploader_id": u"videoseconds",
- u"title": u"Instagram photo by @videoseconds (Videos)"
+ u"title": u"Instagram photo by @videoseconds"
}
}
@@ -27,7 +27,7 @@ class StatigramIE(InfoExtractor):
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title')
- title = html_title.rpartition(u' | Statigram')[0]
+ title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)
uploader_id = self._html_search_regex(
r'@([^ ]+)', title, u'uploader name', fatal=False)
ext = 'mp4'
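
The old rpartition() call only stripped the literal ' | Statigram' suffix; the new re.sub() also drops the optional '(Videos)' marker, which is what the updated test title reflects. A quick sketch of the substitution on two hypothetical page titles:

    import re

    def clean_title(html_title):
        # Strip an optional "(Video)"/"(Videos)" marker plus the " | Statigram" suffix.
        return re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)

    print(clean_title(u'Instagram photo by @videoseconds (Videos) | Statigram'))
    # Instagram photo by @videoseconds
    print(clean_title(u'Instagram photo by @someuser | Statigram'))
    # Instagram photo by @someuser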
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 46b66582c..8b73b8340 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -17,7 +17,7 @@ class TEDIE(InfoExtractor):
_TEST = {
u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
u'file': u'102.mp4',
- u'md5': u'8cd9dfa41ee000ce658fd48fb5d89a61',
+ u'md5': u'2d76ee1576672e0bd8f187513267adf6',
u'info_dict': {
u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922",
u"title": u"Dan Dennett: The illusion of consciousness"
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
new file mode 100644
index 000000000..e0ffeced5
--- /dev/null
+++ b/youtube_dl/extractor/tf1.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+import json
+import re
+
+from .common import InfoExtractor
+
+class TF1IE(InfoExtractor):
+ """
+ TF1 uses the wat.tv player. Currently it can only download videos with the
+ HTML5 player enabled; it cannot download HD videos.
+ """
+ _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
+ _TEST = {
+ u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+ u'file': u'10635995.mp4',
+ u'md5': u'66789d3e91278d332f75e1feb7aea327',
+ u'info_dict': {
+ u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
+ u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ id = mobj.group(1)
+ webpage = self._download_webpage(url, id)
+ embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"',
+ webpage, 'embed url')
+ embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page')
+ wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
+ wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info')
+ wat_info = json.loads(wat_info)['media']
+ wat_url = wat_info['url']
+ return self.url_result(wat_url, 'Wat')
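
TF1IE never downloads the media itself: it digs the wat.tv id out of the embed frame and hands off via url_result(), which is why the test above expects a wat.tv id (10635995.mp4) rather than a TF1 one. A rough sketch of that hand-off, with an invented embed-page fragment standing in for the real response:

    import re

    # Invented fragment of the embed player page; the real page carries the
    # wat.tv id in a UVID query parameter like this.
    embed_page = '... /web/embedPlayer?UVID=10635995&autoStart=false ...'

    wat_id = re.search(r'UVID=(.*?)&', embed_page).group(1)
    print(wat_id)  # 10635995

    # The contentv3 JSON for that id exposes the wat.tv page URL under
    # media['url']; url_result() wraps it so WatIE finishes the job, roughly:
    wat_url = 'http://www.wat.tv/video/some-title-xxxxx_yyyyy_.html'  # placeholder
    result = {'_type': 'url', 'url': wat_url, 'ie_key': 'Wat'}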
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 243f04bff..ad5840ca2 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -9,11 +9,11 @@ from ..utils import (
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
_TEST = {
- u'url': u'http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja',
- u'file': u'53364321212.mp4',
- u'md5': u'0716d3dd51baf68a28b40fdf1251494e',
+ u'url': u'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
+ u'file': u'54196191430.mp4',
+ u'md5': u'479bb068e5b16462f5176a6828829767',
u'info_dict': {
- u"title": u"Rafael Lemos"
+ u"title": u"tatiana maslany news"
}
}
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
new file mode 100644
index 000000000..fcaa6ac01
--- /dev/null
+++ b/youtube_dl/extractor/tutv.py
@@ -0,0 +1,41 @@
+import base64
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+)
+
+class TutvIE(InfoExtractor):
+ _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
+ _TEST = {
+ u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc',
+ u'file': u'2742556.flv',
+ u'md5': u'5eb766671f69b82e528dc1e7769c5cb2',
+ u'info_dict': {
+ u"title": u"Noah en pabellon cuahutemoc"
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<meta property="og:title" content="(.*?)">', webpage, u'title')
+ internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
+
+ data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
+ data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info')
+ data = compat_parse_qs(data_content)
+ video_url = base64.b64decode(data['kpt'][0]).decode('utf-8')
+ ext = video_url.partition(u'?')[0].rpartition(u'.')[2]
+
+ info = {
+ 'id': internal_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ }
+ return [info]
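
The interesting step in TutvIE is decoding the flvurl.php response, whose kpt field is the base64-encoded media URL (the real extractor pulls it out with compat_parse_qs first). A standalone sketch of the decode and the extension guess, with an invented media URL:

    import base64

    # Invented value: what the 'kpt' field of the flvurl.php response carries,
    # i.e. the base64-encoded video URL.
    media_url = 'http://example.com/videos/noah.flv?r=token'
    kpt = base64.b64encode(media_url.encode('utf-8')).decode('ascii')

    video_url = base64.b64decode(kpt).decode('utf-8')
    # Extension = whatever follows the last '.' once the query string is dropped.
    ext = video_url.partition('?')[0].rpartition('.')[2]

    print(video_url)  # http://example.com/videos/noah.flv?r=token
    print(ext)        # flv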
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 11741e27d..7c4562790 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,7 +16,7 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
- _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
IE_NAME = u'vimeo'
_TEST = {
u'url': u'http://vimeo.com/56015672',
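
The only change to the Vimeo pattern is the anchored tail (?:[?].*)?$: a query string after the numeric id is still accepted, while anything else trailing the id no longer matches. A quick check against two hypothetical URLs:

    import re

    _VALID_URL = (r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/'
                  r'(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?'
                  r'(?P<direct_link>play_redirect_hls\?clip_id=)?'
                  r'(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$')

    # A query string after the id still matches, and the id group stays clean ...
    m = re.match(_VALID_URL, 'http://vimeo.com/56015672?some_tracking_param=1')
    print(m.group('id'))  # 56015672

    # ... while trailing path segments after the id are now rejected.
    print(re.match(_VALID_URL, 'http://vimeo.com/56015672/extra') is None)  # True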
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
new file mode 100644
index 000000000..0d1302cd2
--- /dev/null
+++ b/youtube_dl/extractor/wat.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+
+import json
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+)
+
+
+class WatIE(InfoExtractor):
+ _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
+ IE_NAME = 'wat.tv'
+ _TEST = {
+ u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+ u'file': u'10631273.mp4',
+ u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
+ u'info_dict': {
+ u'title': u'World War Z - Philadelphia VOST',
+ u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ }
+ }
+
+ def download_video_info(self, real_id):
+ # 'contentv4' is used on the website, but it also returns the related
+ # videos, which we don't need
+ info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
+ info = json.loads(info)
+ return info['media']
+
+
+ def _real_extract(self, url):
+ def real_id_for_chapter(chapter):
+ return chapter['tc_start'].split('-')[0]
+ mobj = re.match(self._VALID_URL, url)
+ short_id = mobj.group('shortID')
+ webpage = self._download_webpage(url, short_id)
+ real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+
+ video_info = self.download_video_info(real_id)
+ chapters = video_info['chapters']
+ first_chapter = chapters[0]
+
+ if real_id_for_chapter(first_chapter) != real_id:
+ self.to_screen('Multipart video detected')
+ chapter_urls = []
+ for chapter in chapters:
+ chapter_id = real_id_for_chapter(chapter)
+ # Yes, when this chapter is processed by WatIE,
+ # it will download the info again
+ chapter_info = self.download_video_info(chapter_id)
+ chapter_urls.append(chapter_info['url'])
+ entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
+ return self.playlist_result(entries, real_id, video_info['title'])
+
+ # Otherwise we can continue and extract just one part; we have to use
+ # the short id to get the video url
+ player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
+ 'html5': '1'})
+ player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
+ real_id, u'Downloading player info')
+ player = json.loads(player_info)['player']
+ html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
+ 'html5 player')
+ player_webpage = self._download_webpage(html5_player, real_id,
+ u'Downloading player webpage')
+
+ video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
+ 'video url')
+ info = {'id': real_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': first_chapter['title'],
+ 'thumbnail': first_chapter['preview'],
+ 'description': first_chapter['description'],
+ 'view_count': video_info['views'],
+ }
+ if 'date_diffusion' in first_chapter:
+ info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
+
+ return info
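
The multipart check in WatIE hinges on each chapter's tc_start field: the part before the first '-' is that chapter's own wat.tv id, so when the first chapter's id differs from the page's id, the page is really a playlist of parts. A small sketch with invented contentv3-style data (the exact tc_start layout after the dash is a guess):

    def real_id_for_chapter(chapter):
        # A chapter's id is the part of tc_start before the first '-'.
        return chapter['tc_start'].split('-')[0]

    # Invented data shaped like the 'media' object of the contentv3 response.
    video_info = {
        'title': 'Some multipart programme',
        'chapters': [
            {'tc_start': '10631200-00:00:00'},
            {'tc_start': '10631201-00:42:10'},
        ],
    }
    real_id = '10631273'  # id scraped from the page's xtpage variable

    if real_id_for_chapter(video_info['chapters'][0]) != real_id:
        # Multipart: each chapter becomes its own entry in a playlist result.
        print([real_id_for_chapter(c) for c in video_info['chapters']])
        # ['10631200', '10631201']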
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 96d8257d9..109c8a93f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -168,7 +168,7 @@ class YoutubeIE(InfoExtractor):
self.to_screen(u'RTMP download detected')
def _decrypt_signature(self, s):
- """Decrypt the key"""
+ """Turn the encrypted s field into a working signature"""
if len(s) == 88:
return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
@@ -402,6 +402,9 @@ class YoutubeIE(InfoExtractor):
return video_id
def _real_extract(self, url):
+ if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
+ self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
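
The new warning targets a common shell mistake: an unquoted URL is cut at the '&', so youtube-dl only ever sees the feature= part and no video id. A quick check of which URLs the guard catches:

    import re

    _TRUNCATED = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    # What the shell passes along when an unquoted '&v=...' is eaten:
    print(bool(re.match(_TRUNCATED, 'http://www.youtube.com/watch?feature=player_embedded')))
    # True

    # A properly quoted URL still carries the v= parameter and is not flagged:
    print(bool(re.match(_TRUNCATED,
                        'http://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc')))
    # False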