aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2014-08-21 11:57:52 +0200
committer Philipp Hagemeister <phihag@phihag.de> 2014-08-21 11:57:52 +0200
commit 35f76e0061373ad344b3cbea30422c586abc16b5 (patch)
tree c74508b28e396c9c81d0764315ed9263afa1ab24 /youtube_dl/extractor
parent 3f338cd6de1e198e810ca8e0c85a346c9537a47f (diff)
parent f83dda12ad37d1b83142e2821e72f8e6c0b4405e (diff)
Merge remote-tracking branch 'origin/master'
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 4
-rw-r--r-- youtube_dl/extractor/dfb.py 2
-rw-r--r-- youtube_dl/extractor/generic.py 7
-rw-r--r-- youtube_dl/extractor/howstuffworks.py 134
-rw-r--r-- youtube_dl/extractor/jove.py 80
-rw-r--r-- youtube_dl/extractor/mitele.py 60
-rw-r--r-- youtube_dl/extractor/pbs.py 53
-rw-r--r-- youtube_dl/extractor/rtlnl.py 52
-rw-r--r-- youtube_dl/extractor/teamcoco.py 2
-rw-r--r-- youtube_dl/extractor/yahoo.py 15
10 files changed, 392 insertions, 17 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 381a5d999..be7616edc 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -130,6 +130,7 @@ from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
+from .howstuffworks import HowStuffWorksIE
from .huffpost import HuffPostIE
from .hypem import HypemIE
from .iconosquare import IconosquareIE
@@ -150,6 +151,7 @@ from .ivi import (
from .izlesene import IzleseneIE
from .jadorecettepub import JadoreCettePubIE
from .jeuxvideo import JeuxVideoIE
+from .jove import JoveIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
from .jpopsukitv import JpopsukiIE
@@ -181,6 +183,7 @@ from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
+from .mitele import MiTeleIE
from .mixcloud import MixcloudIE
from .mlb import MLBIE
from .mpora import MporaIE
@@ -255,6 +258,7 @@ from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
+from .rtlnl import RtlXlIE
from .rtlnow import RTLnowIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE
diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py
index cb8e06822..8049779b0 100644
--- a/youtube_dl/extractor/dfb.py
+++ b/youtube_dl/extractor/dfb.py
@@ -30,7 +30,7 @@ class DFBIE(InfoExtractor):
video_id)
video_info = player_info.find('video')
- f4m_info = self._download_xml(video_info.find('url').text, video_id)
+ f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
token_el = f4m_info.find('token')
manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index bcb076594..8e915735e 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -706,6 +706,13 @@ class GenericIE(InfoExtractor):
url = unescapeHTML(mobj.group('url'))
return self.url_result(url, ie='MTVServicesEmbedded')
+ # Look for embedded yahoo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Yahoo')
+
# Start with something easy: JW Player in SWFObject
found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if not found:
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
new file mode 100644
index 000000000..68684b997
--- /dev/null
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import re
+import json
+import random
+import string
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr
+
+
+class HowStuffWorksIE(InfoExtractor):
+    """Information extractor for ``*-video.htm`` pages on howstuffworks.com."""
+    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+    _TESTS = [
+        {
+            'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
+            'info_dict': {
+                'id': '450221',
+                'display_id': 'cool-jobs-iditarod-musher',
+                'ext': 'flv',
+                'title': 'Cool Jobs - Iditarod Musher',
+                'description': 'md5:82bb58438a88027b8186a1fccb365f90',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+        {
+            'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm',
+            'info_dict': {
+                'id': '553470',
+                'display_id': 'deadliest-catch-jakes-farewell-pots',
+                'ext': 'mp4',
+                'title': 'Deadliest Catch: Jake\'s Farewell Pots',
+                'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+        {
+            'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
+            'info_dict': {
+                'id': '440011',
+                'display_id': 'sword-swallowing-1-by-dan-meyer',
+                'ext': 'flv',
+                'title': 'Sword Swallowing #1 by Dan Meyer',
+                'description': 'md5:b2409e88172913e2e7d3d1159b0ef735',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict (metadata plus sorted formats) for the video page at url."""
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        webpage = self._download_webpage(url, display_id)
+
+        content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id')
+
+        mp4 = self._search_regex(
+            r'''(?xs)var\s+clip\s*=\s*{\s*
+                .+?\s*
+                content_id\s*:\s*%s\s*,\s*
+                .+?\s*
+                mp4\s*:\s*\[(.*?),?\]\s*
+                };\s*
+                videoData\.push\(clip\);''' % content_id,
+            webpage, 'mp4', fatal=False, default=None)
+
+        smil = self._download_xml(
+            'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id,
+            content_id, 'Downloading video SMIL')
+
+        http_base = find_xpath_attr(
+            smil, './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
+            'name', 'httpBase').get('content')
+
+        def random_string(str_len=0):
+            return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)])
+
+        URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12))
+        formats = []
+        # Prefer the mp4 list scraped from the page; otherwise enumerate the SMIL <video> entries.
+        if mp4:
+            for video in json.loads('[%s]' % mp4):
+                bitrate = video['bitrate']
+                fmt = {
+                    'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX,
+                    'format_id': bitrate,
+                }
+                m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate)
+                if m:
+                    fmt['vbr'] = int(m.group('vbr'))
+                formats.append(fmt)
+        else:
+            for video in smil.findall(
+                    './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
+                # Floor division keeps vbr an integer on Python 3, where / always yields a float.
+                vbr = int(video.attrib['system-bitrate']) // 1000
+                formats.append({
+                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
+                    'format_id': '%dk' % vbr,
+                    'vbr': vbr,
+                })
+
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        TITLE_SUFFIX = ' : HowStuffWorks'
+        if title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)]
+
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': content_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py
new file mode 100644
index 000000000..cf73cd753
--- /dev/null
+++ b/youtube_dl/extractor/jove.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate
+)
+
+
+class JoveIE(InfoExtractor):
+    """Information extractor for video articles under jove.com/video/<id>."""
+    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+    _TESTS = [
+        {
+            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
+            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
+            'info_dict': {
+                'id': '2744',
+                'ext': 'mp4',
+                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
+                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
+                'thumbnail': r're:^https?://.*\.png$',
+                'upload_date': '20110523',
+            }
+        },
+        {
+            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
+            'md5': '914aeb356f416811d911996434811beb',
+            'info_dict': {
+                'id': '51796',
+                'ext': 'mp4',
+                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
+                'description': 'md5:35ff029261900583970c4023b70f1dc9',
+                'thumbnail': r're:^https?://.*\.png$',
+                'upload_date': '20140802',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict for url; raise ExtractorError if the chapters XML has no video URL."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        chapters_id = self._html_search_regex(
+            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
+
+        chapters_xml = self._download_xml(
+            self._CHAPTERS_URL.format(video_id=chapters_id), video_id,
+            note='Downloading chapters XML', errnote='Failed to download chapters XML')
+
+        video_url = chapters_xml.attrib.get('video')
+        if not video_url:
+            raise ExtractorError('Failed to get the video URL')
+
+        title = self._html_search_meta('citation_title', webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+            webpage, 'description', fatal=False)
+        publish_date = unified_strdate(self._html_search_meta(
+            'citation_publication_date', webpage, 'publish date', fatal=False))
+        comment_count_text = self._html_search_regex(
+            r'<meta name="num_comments" content="(\d+) Comments?"',
+            webpage, 'comment count', fatal=False)
+        # The regex captures digits as text; report an int, or None when nothing was captured.
+        comment_count = int(comment_count_text) if comment_count_text else None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+            'upload_date': publish_date,
+            'comment_count': comment_count,
+        }
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
new file mode 100644
index 000000000..979f3d692
--- /dev/null
+++ b/youtube_dl/extractor/mitele.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ get_element_by_attribute,
+ parse_duration,
+ strip_jsonp,
+)
+
+
+class MiTeleIE(InfoExtractor):
+    """Information extractor for episode pages on www.mitele.es."""
+    IE_NAME = 'mitele.es'
+    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
+
+    _TEST = {
+        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
+        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
+        'info_dict': {
+            'id': '0fce117d',
+            'ext': 'mp4',
+            'title': 'Programa 144 - Tor, la web invisible',
+            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+            'display_id': 'programa-144',
+            'duration': 2913,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Resolve an episode page into its tokenized video URL plus metadata."""
+        display_id = re.match(self._VALID_URL, url).group('episode')
+        page = self._download_webpage(url, display_id)
+
+        # Single quotes are swapped for double quotes so json.loads accepts the object literal.
+        raw_embed_data = self._search_regex(
+            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', page, 'embed data',
+            flags=re.DOTALL)
+        embed_data = json.loads(raw_embed_data.replace('\'', '"'))
+
+        info_doc = self._download_xml(embed_data['flashvars']['host'], display_id)
+        info_el = info_doc.find('./video/info')
+
+        token_url = 'http://token.mitele.es/?' + compat_urllib_parse.urlencode(
+            {'id': info_el.find('videoUrl/link').text})
+        token_info = self._download_json(
+            token_url, display_id, transform_source=strip_jsonp)
+
+        return {
+            'id': embed_data['videoId'],
+            'display_id': display_id,
+            'title': info_el.find('title').text,
+            'url': token_info['tokenizedUrl'],
+            'description': get_element_by_attribute('class', 'text', page),
+            'thumbnail': info_el.find('thumb').text,
+            'duration': parse_duration(info_el.find('duration').text),
+        }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index ec95d0704..dee4af6f1 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -20,17 +20,41 @@ class PBSIE(InfoExtractor):
)
'''
- _TEST = {
- 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
- 'md5': 'ce1888486f0908d555a8093cac9a7362',
- 'info_dict': {
- 'id': '2365006249',
- 'ext': 'mp4',
- 'title': 'A More Perfect Union',
- 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
- 'duration': 3190,
+ _TESTS = [
+ {
+ 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
+ 'md5': 'ce1888486f0908d555a8093cac9a7362',
+ 'info_dict': {
+ 'id': '2365006249',
+ 'ext': 'mp4',
+ 'title': 'A More Perfect Union',
+ 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
+ 'duration': 3190,
+ },
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
+ 'md5': '143c98aa54a346738a3d78f54c925321',
+ 'info_dict': {
+ 'id': '2365297690',
+ 'ext': 'mp4',
+ 'title': 'Losing Iraq',
+ 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+ 'duration': 5050,
+ },
},
- }
+ {
+ 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
+ 'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
+ 'info_dict': {
+ 'id': '2201174722',
+ 'ext': 'mp4',
+ 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+ 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
+ 'duration': 801,
+ },
+ },
+ ]
def _extract_ids(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -40,10 +64,13 @@ class PBSIE(InfoExtractor):
if presumptive_id:
webpage = self._download_webpage(url, display_id)
- # frontline video embed
+ MEDIA_ID_REGEXES = [
+ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
+ r'class="coveplayerid">([^<]+)<', # coveplayer
+ ]
+
media_id = self._search_regex(
- r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",
- webpage, 'frontline video ID', fatal=False, default=None)
+ MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
return media_id, presumptive_id
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
new file mode 100644
index 000000000..14928cd62
--- /dev/null
+++ b/youtube_dl/extractor/rtlnl.py
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RtlXlIE(InfoExtractor):
+    """Information extractor for rtlxl.nl ``#!/<show>/<uuid>`` player URLs."""
+    IE_NAME = 'rtlxl.nl'
+    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+
+    _TEST = {
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
+        'info_dict': {
+            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+            'ext': 'flv',
+            'title': 'RTL Nieuws - Laat',
+            'description': 'Dagelijks het laatste nieuws uit binnen- en '
+            'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van '
+            'onze mobiele apps.',
+            'timestamp': 1408051800,
+            'upload_date': '20140814',
+        },
+        'params': {
+            # We download the first bytes of the first fragment, it can't be
+            # processed by the f4m downloader because it isn't complete
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict (title, f4m formats, timestamp, description) for url."""
+        mobj = re.match(self._VALID_URL, url)
+        uuid = mobj.group('uuid')
+
+        info = self._download_json(
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, uuid)
+        material = info['material'][0]
+        episode_info = info['episodes'][0]
+
+        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
+        progname = info['abstracts'][0]['name']
+        subtitle = material['title'] or episode_info['name']
+
+        return {
+            'id': uuid,
+            'title': '%s - %s' % (progname, subtitle),
+            'formats': self._extract_f4m_formats(f4m_url, uuid),
+            'timestamp': material['original_date'],
+            'description': episode_info['synopsis'],
+        }
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index f8dd7e955..fa796ce72 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor):
video_id = mobj.group("video_id")
if not video_id:
video_id = self._html_search_regex(
- r'<article class="video" data-id="(\d+?)"',
+ r'data-node-id="(\d+?)"',
webpage, 'video id')
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index d84be2562..0e3b33b16 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,7 +15,7 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
- _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
+ _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -46,12 +46,23 @@ class YahooIE(InfoExtractor):
'title': 'The World Loves Spider-Man',
'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
}
- }
+ },
+ {
+ 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
+ 'md5': '60e8ac193d8fb71997caa8fce54c6460',
+ 'info_dict': {
+ 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
+ 'ext': 'mp4',
+ 'title': "Yahoo Saves 'Community'",
+ 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
+ }
+ },
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ url = mobj.group('url')
webpage = self._download_webpage(url, video_id)
items_json = self._search_regex(