aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py | 4
-rw-r--r-- youtube_dl/extractor/addanime.py | 76
-rw-r--r-- youtube_dl/extractor/appletrailers.py | 167
-rw-r--r-- youtube_dl/extractor/cnn.py | 58
-rw-r--r-- youtube_dl/extractor/common.py | 2
-rw-r--r-- youtube_dl/extractor/googleplus.py | 4
-rw-r--r-- youtube_dl/extractor/nbc.py | 33
-rw-r--r-- youtube_dl/extractor/youtube.py | 2
8 files changed, 342 insertions, 4 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index eeeb3db50..c76b99a81 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,3 +1,5 @@
+from .appletrailers import AppleTrailersIE
+from .addanime import AddAnimeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import ArteTvIE
@@ -9,6 +11,7 @@ from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
+from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .condenast import CondeNastIE
@@ -53,6 +56,7 @@ from .muzu import MuzuTVIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .nba import NBAIE
+from .nbc import NBCNewsIE
from .ooyala import OoyalaIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
new file mode 100644
index 000000000..46db8262f
--- /dev/null
+++ b/youtube_dl/extractor/addanime.py
@@ -0,0 +1,76 @@
+import ast
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_HTTPError,
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+
+ ExtractorError,
+)
+
+
+class AddAnimeIE(InfoExtractor):
+    """Extractor for add-anime.net ``watch_video.php?v=<id>`` pages."""
+
+    # The dot in "watch_video.php" is escaped so that it only matches a
+    # literal '.', not an arbitrary character.
+    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+    IE_NAME = u'AddAnime'
+    _TEST = {
+        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+        u'file': u'24MR3YO5SAS9.flv',
+        u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1',
+        u'info_dict': {
+            u"description": u"One Piece 606",
+            u"title": u"One Piece 606"
+        }
+    }
+
+    def _confirm_redirect(self, url, video_id, redir_webpage):
+        """Answer the challenge form found in *redir_webpage*.
+
+        Reads the form action, the hidden ``jschl_vc`` value and the
+        ``a + b * c`` arithmetic task from the page, then requests the
+        form action with the computed answer.
+        NOTE(review): this looks like an anti-bot JavaScript challenge;
+        the exact provider is not visible from this code.
+
+        Raises ExtractorError if the arithmetic task cannot be found.
+        """
+        action = self._search_regex(
+            r'<form id="challenge-form" action="([^"]+)"',
+            redir_webpage, u'Redirect form')
+        vc = self._search_regex(
+            r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
+            redir_webpage, u'redirect vc value')
+        av = re.search(
+            r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
+            redir_webpage)
+        if av is None:
+            raise ExtractorError(u'Cannot find redirect math task')
+        av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))
+
+        parsed_url = compat_urllib_parse_urlparse(url)
+        # The expected answer is the arithmetic result plus the length of
+        # the host name.
+        av_val = av_res + len(parsed_url.netloc)
+        confirm_url = (
+            parsed_url.scheme + u'://' + parsed_url.netloc +
+            action + '?' +
+            compat_urllib_parse.urlencode({
+                'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
+        self._download_webpage(
+            confirm_url, video_id,
+            note=u'Confirming after redirect')
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at *url*.
+
+        If the first download fails with an HTTP error, the error body is
+        treated as a challenge page: it is answered via
+        ``_confirm_redirect`` and the video page is requested again.
+        Any other ExtractorError is re-raised unchanged.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+
+        # Only the download itself is guarded; URL parsing cannot raise
+        # the ExtractorError handled below.
+        try:
+            webpage = self._download_webpage(url, video_id)
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError):
+                raise
+
+            redir_webpage = ee.cause.read().decode('utf-8')
+            self._confirm_redirect(url, video_id, redir_webpage)
+            webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(r"var normal_video_file = '(.*?)';",
+                                       webpage, u'video file URL')
+        video_title = self._og_search_title(webpage)
+        video_description = self._og_search_description(webpage)
+
+        return {
+            '_type': 'video',
+            'id': video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': video_title,
+            'description': video_description
+        }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
new file mode 100644
index 000000000..b3bdb2955
--- /dev/null
+++ b/youtube_dl/extractor/appletrailers.py
@@ -0,0 +1,167 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+)
+
+
+class AppleTrailersIE(InfoExtractor):
+    """Extractor that turns a trailers.apple.com movie page into a playlist.
+
+    Each ``<li>`` of the movie's ``web.inc`` playlist snippet becomes one
+    playlist entry with its own list of formats.
+    """
+
+    # Dots in the host name are escaped so they only match a literal '.'.
+    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _TEST = {
+        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
+        u"playlist": [
+            {
+                u"file": u"manofsteel-trailer4.mov",
+                u"md5": u"11874af099d480cc09e103b189805d5f",
+                u"info_dict": {
+                    u"duration": 111,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
+                    u"title": u"Trailer 4",
+                    u"upload_date": u"20130523",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-trailer3.mov",
+                u"md5": u"07a0a262aae5afe68120eed61137ab34",
+                u"info_dict": {
+                    u"duration": 182,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
+                    u"title": u"Trailer 3",
+                    u"upload_date": u"20130417",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-trailer.mov",
+                u"md5": u"e401fde0813008e3307e54b6f384cff1",
+                u"info_dict": {
+                    u"duration": 148,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
+                    u"title": u"Trailer",
+                    u"upload_date": u"20121212",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-teaser.mov",
+                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
+                u"info_dict": {
+                    u"duration": 93,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
+                    u"title": u"Teaser",
+                    u"upload_date": u"20120721",
+                    u"uploader_id": u"wb",
+                },
+            }
+        ]
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist dict with one entry per trailer of the movie.
+
+        Downloads the movie's ``includes/playlists/web.inc`` snippet, and
+        for every ``<li>`` in it collects title, thumbnail, upload date,
+        duration and the available formats. Format URLs are resolved by
+        following the per-size page and then the per-trailer detail page.
+
+        Raises ExtractorError when a detail page has no ``movieLink``
+        anchor, when a movie link is not a ``.mov`` file, or when a
+        trailer ends up with no formats at all.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        movie = mobj.group('movie')
+        uploader_id = mobj.group('company')
+
+        # Strip the query string once, so every derived URL is built from
+        # the same base (previously only the playlist URL was stripped).
+        base_url = url.partition(u'?')[0]
+
+        playlist_url = base_url + u'/includes/playlists/web.inc'
+        playlist_snippet = self._download_webpage(playlist_url, movie)
+        # Scripts are removed and the snippet is wrapped in a single root
+        # element before it is handed to the XML parser.
+        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
+        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+
+        # Parsed size pages, keyed by size id, so each is fetched once.
+        size_cache = {}
+
+        doc = xml.etree.ElementTree.fromstring(playlist_html)
+        playlist = []
+        for li in doc.findall('./div/ul/li'):
+            title = li.find('.//h3').text
+            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
+            thumbnail = li.find('.//img').attrib['src']
+
+            # Upload date and runtime are optional: any missing element
+            # or text simply leaves the value at None.
+            upload_date = None
+            duration = None
+            date_el = li.find('.//p')
+            if date_el is not None:
+                m = re.search(
+                    r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})',
+                    date_el.text or u'')
+                if m:
+                    upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
+                runtime_el = date_el.find('./br')
+                runtime_text = runtime_el.tail if runtime_el is not None else None
+                m = re.search(
+                    r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})',
+                    runtime_text or u'')
+                if m:
+                    duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
+
+            formats = []
+            for formats_el in li.findall('.//a'):
+                # .get() so anchors without a class attribute are skipped
+                # instead of raising KeyError.
+                if formats_el.get('class') != 'OverlayPanel':
+                    continue
+                target = formats_el.attrib['target']
+
+                format_code = formats_el.text
+                if 'Automatic' in format_code:
+                    continue
+
+                size_q = formats_el.attrib['href']
+                size_id = size_q.rpartition('#videos-')[2]
+                if size_id not in size_cache:
+                    size_url = base_url + size_q
+                    sizepage_html = self._download_webpage(
+                        size_url, movie,
+                        note=u'Downloading size info %s' % size_id,
+                        errnote=u'Error while downloading size info %s' % size_id,
+                    )
+                    _doc = xml.etree.ElementTree.fromstring(sizepage_html)
+                    size_cache[size_id] = _doc
+
+                sizepage_doc = size_cache[size_id]
+                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
+                for vid_a in links:
+                    href = vid_a.get('href')
+                    if not href or not href.endswith(target):
+                        continue
+                    detail_q = href.partition('#')[0]
+                    detail_url = base_url + '/' + detail_q
+
+                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
+                    if m is None:
+                        raise ExtractorError(
+                            u'Cannot parse detail link %s' % detail_q)
+                    detail_id = m.group('detail_id')
+
+                    detail_html = self._download_webpage(
+                        detail_url, movie,
+                        note=u'Downloading detail %s %s' % (detail_id, size_id),
+                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
+                    )
+                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
+                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
+                    # Explicit checks instead of assert, which is removed
+                    # when Python runs with -O.
+                    if movie_link_el is None or movie_link_el.get('class') != 'movieLink':
+                        raise ExtractorError(
+                            u'Cannot find movie link in detail %s %s' % (detail_id, size_id))
+                    # NOTE(review): this rewrites every '_' in the link to
+                    # '_h' - confirm that links contain exactly one '_'.
+                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
+                    ext = determine_ext(movie_link)
+                    if ext != 'mov':
+                        raise ExtractorError(
+                            u'Unexpected movie file extension %s' % ext)
+
+                    formats.append({
+                        'format': format_code,
+                        'ext': ext,
+                        'url': movie_link,
+                    })
+
+            if not formats:
+                raise ExtractorError(u'No formats found for %s' % video_id)
+
+            info = {
+                '_type': 'video',
+                'id': video_id,
+                'title': title,
+                'formats': formats,
+                'duration': duration,
+                'thumbnail': thumbnail,
+                'upload_date': upload_date,
+                'uploader_id': uploader_id,
+                'user_agent': 'QuickTime compatible (youtube-dl)',
+            }
+            # TODO: Remove when #980 has been merged
+            info['url'] = formats[-1]['url']
+            info['ext'] = formats[-1]['ext']
+
+            playlist.append(info)
+
+        return {
+            '_type': 'playlist',
+            'id': movie,
+            'entries': playlist,
+        }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
new file mode 100644
index 000000000..a79f881cd
--- /dev/null
+++ b/youtube_dl/extractor/cnn.py
@@ -0,0 +1,58 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class CNNIE(InfoExtractor):
+    """Extractor for cnn.com / edition.cnn.com video pages."""
+
+    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''
+
+    _TESTS = [{
+        u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
+        u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
+        u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
+        u'info_dict': {
+            u'title': u'Nadal wins 8th French Open title',
+            u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+        },
+    },
+    {
+        u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
+        u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
+        u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e",
+        u"info_dict": {
+            u"title": "Student's epic speech stuns new freshmen",
+            u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\""
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video at *url*.
+
+        Downloads the ``index.xml`` description for the path taken from
+        the URL, picks the ``<file>`` whose ``bitrate`` attribute sorts
+        highest as (width, height, bitrate), and lists all ``<image>``
+        elements as thumbnails ordered by (height, width).
+
+        Raises ExtractorError when no ``<file>`` has a parsable
+        ``bitrate`` attribute. ``thumbnail`` is None when the XML
+        contains no images.
+        """
+        # Local import: this module's top-level import from ..utils only
+        # brings in determine_ext.
+        from ..utils import ExtractorError
+
+        mobj = re.match(self._VALID_URL, url)
+        path = mobj.group('path')
+        page_title = mobj.group('title')
+        info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
+        info_xml = self._download_webpage(info_url, page_title)
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+        formats = []
+        for f in info.findall('files/file'):
+            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?', f.attrib['bitrate'])
+            if mf is None:
+                continue
+            formats.append(
+                (int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
+        if not formats:
+            # Previously this surfaced as a bare IndexError.
+            raise ExtractorError(u'No video formats found')
+        # Tuples compare element-wise, so max() yields the same entry the
+        # sorted list used to end with.
+        video_path = max(formats)[3]
+        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+
+        thumbnails = sorted(
+            ((int(t.attrib['height']), int(t.attrib['width'])), t.text)
+            for t in info.findall('images/image'))
+        thumbs_list = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+        # The last entry has the largest (height, width); None if there
+        # are no images.
+        thumbnail = thumbnails[-1][1] if thumbnails else None
+
+        return {
+            'id': info.attrib['id'],
+            'title': info.find('headline').text,
+            'url': video_url,
+            'ext': determine_ext(video_url),
+            'thumbnail': thumbnail,
+            'thumbnails': thumbs_list,
+            'description': info.find('description').text,
+        }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 52c4483c9..12169b2bb 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -129,7 +129,7 @@ class InfoExtractor(object):
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index 9f7fc19a4..f1cd88983 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor):
webpage, 'title', default=u'NA')
# Step 2, Simulate clicking the image box to launch video
- DOMAIN = 'https://plus.google.com'
- video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
+ DOMAIN = 'https://plus.google.com/'
+ video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, u'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
new file mode 100644
index 000000000..3bc9dae6d
--- /dev/null
+++ b/youtube_dl/extractor/nbc.py
@@ -0,0 +1,33 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr, compat_str
+
+
+class NBCNewsIE(InfoExtractor):
+    """Extractor for www.nbcnews.com ``/video/<slug>/<numeric id>`` URLs."""
+
+    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
+        u'file': u'52753292.flv',
+        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
+        u'info_dict': {
+            u'title': u'Crew emerges after four-month Mars food study',
+            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video at *url*.
+
+        The numeric id from the URL is used to download an XML document
+        from ``/id/<id>/displaymode/1219``; title, caption and the media
+        entries of its ``<video>`` element fill the result.
+        """
+        video_id = re.match(self._VALID_URL, url).group('id')
+        metadata_url = 'http://www.nbcnews.com/id/%s/displaymode/1219' % video_id
+        raw_xml = self._download_webpage(metadata_url, video_id)
+        root = xml.etree.ElementTree.fromstring(raw_xml.encode('utf-8'))
+        video_el = root.find('video')
+
+        # Looked up in the same order as the keys of the returned dict.
+        title = video_el.find('headline').text
+        media_url = find_xpath_attr(video_el, 'media', 'type', 'flashVideo').text
+        description = compat_str(video_el.find('caption').text)
+        thumbnail = find_xpath_attr(video_el, 'media', 'type', 'thumbnail').text
+
+        return {
+            'id': video_id,
+            'title': title,
+            'ext': 'flv',
+            'url': media_url,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index af01c9da0..8e486afd0 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif len(s) == 89:
return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
elif len(s) == 88:
- return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
+ return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
elif len(s) == 87:
return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
elif len(s) == 86: